diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15543,8 +15543,8 @@ that is used by the conditional branch controlling the loop. -Experimental Vector Reduction Intrinsics ----------------------------------------- +Vector Reduction Intrinsics +--------------------------- Horizontal reductions of vectors can be expressed using the following intrinsics. Each one takes a vector operand as an input and applies its @@ -15552,7 +15552,7 @@ scalar result of the same element type. -'``llvm.experimental.vector.reduce.add.*``' Intrinsic +'``llvm.vector.reduce.add.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15560,13 +15560,13 @@ :: - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a) - declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) + declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD`` +The '``llvm.vector.reduce.add.*``' intrinsics do an integer ``ADD`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15574,7 +15574,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.v2.fadd.*``' Intrinsic +'``llvm.vector.reduce.fadd.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15582,13 +15582,13 @@ :: - declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %a) - declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %start_value, <2 x double> %a) + declare float @llvm.vector.reduce.fadd.f32.v4f32(float %start_value, <4 x float> %a) + declare double @llvm.vector.reduce.fadd.f64.v2f64(double %start_value, <2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.v2.fadd.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fadd.*``' intrinsics do a floating-point ``ADD`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15619,11 +15619,11 @@ :: - %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction - %ord = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction + %unord = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %input) ; unordered reduction + %ord = call float @llvm.vector.reduce.fadd.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction -'``llvm.experimental.vector.reduce.mul.*``' Intrinsic +'``llvm.vector.reduce.mul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15631,13 +15631,13 @@ :: - declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a) - declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a) + declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a) + declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL`` +The '``llvm.vector.reduce.mul.*``' intrinsics do an integer ``MUL`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. 
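As a small illustrative sketch (operand values invented here, not taken from the patch), the integer reductions are equivalent to ordinary scalar arithmetic over the lanes::

   %sum  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) ; 1+2+3+4 = 10
   %prod = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) ; 1*2*3*4 = 24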
@@ -15645,7 +15645,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.v2.fmul.*``' Intrinsic +'``llvm.vector.reduce.fmul.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15653,13 +15653,13 @@ :: - declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %a) - declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %start_value, <2 x double> %a) + declare float @llvm.vector.reduce.fmul.f32.v4f32(float %start_value, <4 x float> %a) + declare double @llvm.vector.reduce.fmul.f64.v2f64(double %start_value, <2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.v2.fmul.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fmul.*``' intrinsics do a floating-point ``MUL`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15690,10 +15690,10 @@ :: - %unord = call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction - %ord = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction + %unord = call reassoc float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %input) ; unordered reduction + %ord = call float @llvm.vector.reduce.fmul.f32.v4f32(float %start_value, <4 x float> %input) ; ordered reduction -'``llvm.experimental.vector.reduce.and.*``' Intrinsic +'``llvm.vector.reduce.and.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15701,12 +15701,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND`` +The '``llvm.vector.reduce.and.*``' intrinsics do a bitwise ``AND`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15714,7 +15714,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.or.*``' Intrinsic +'``llvm.vector.reduce.or.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15722,12 +15722,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction +The '``llvm.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15735,7 +15735,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.xor.*``' Intrinsic +'``llvm.vector.reduce.xor.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15743,12 +15743,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR`` +The '``llvm.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. 
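A hand-worked sketch of the bitwise reductions under the new names (constants chosen purely for illustration)::

   %and = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> <i32 7, i32 6, i32 14, i32 15>) ; 7 & 6 & 14 & 15 = 6
   %or  = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> <i32 1, i32 2, i32 4, i32 8>)    ; 1|2|4|8 = 15
   %xor = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> <i32 1, i32 2, i32 4, i32 8>)   ; 1^2^4^8 = 15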
@@ -15756,7 +15756,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.smax.*``' Intrinsic +'``llvm.vector.reduce.smax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15764,12 +15764,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer +The '``llvm.vector.reduce.smax.*``' intrinsics do a signed integer ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15777,7 +15777,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.smin.*``' Intrinsic +'``llvm.vector.reduce.smin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15785,12 +15785,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer +The '``llvm.vector.reduce.smin.*``' intrinsics do a signed integer ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15798,7 +15798,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.umax.*``' Intrinsic +'``llvm.vector.reduce.umax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15806,12 +15806,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned +The '``llvm.vector.reduce.umax.*``' intrinsics do an unsigned integer ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15819,7 +15819,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. -'``llvm.experimental.vector.reduce.umin.*``' Intrinsic +'``llvm.vector.reduce.umin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15827,12 +15827,12 @@ :: - declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a) + declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned +The '``llvm.vector.reduce.umin.*``' intrinsics do an unsigned integer ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15840,7 +15840,7 @@ """""""""" The argument to this intrinsic must be a vector of integer values. 
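An illustrative case (values chosen by hand) of why the signed and unsigned variants differ on the same bit pattern::

   %smax = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> <i8 -1, i8 1, i8 2, i8 3>) ; = 3
   %umax = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> <i8 -1, i8 1, i8 2, i8 3>) ; = -1 (0xFF, i.e. 255 read as unsigned)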
-'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic +'``llvm.vector.reduce.fmax.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15848,13 +15848,13 @@ :: - declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) - declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + declare float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) + declare double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fmax.*``' intrinsics do a floating-point ``MAX`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. @@ -15870,7 +15870,7 @@ """""""""" The argument to this intrinsic must be a vector of floating-point values. -'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic +'``llvm.vector.reduce.fmin.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -15879,13 +15879,13 @@ :: - declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) - declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + declare float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) + declare double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) Overview: """"""""" -The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating-point +The '``llvm.vector.reduce.fmin.*``' intrinsics do a floating-point ``MIN`` reduction of a vector, returning the result as a scalar. The return type matches the element-type of the vector input. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -63,6 +63,10 @@ * Added the ``byref`` attribute to better represent argument passing for the `amdgpu_kernel` calling convention. +* The ``llvm.experimental.vector.reduce`` family of intrinsics have been renamed + to drop the "experimental" from the name, reflecting their now fully supported + status in the IR. 
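As a hypothetical before/after sketch of the rename (the values and %names are invented for illustration), IR or bitcode written against the old names, such as::

   %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v)
   %f = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %a)

is rewritten by the auto-upgrade support in this patch to the unprefixed forms::

   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
   %f = call float @llvm.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)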
+ Changes to building LLVM ------------------------ diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1180,19 +1180,19 @@ return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], VarMask, Alignment, CostKind, I); } - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: { IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I); return getIntrinsicInstrCost(Attrs, CostKind); } @@ -1407,46 +1407,46 @@ return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, CostKind); } - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: // FIXME: Add new flag for cost of strict reductions. return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: // FIXME: Add new flag for cost of strict reductions. 
return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, /*IsPairwiseForm=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, /*IsUnsigned=*/false, CostKind); - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsPairwiseForm=*/false, diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1452,34 +1452,35 @@ //===------------------------ Reduction Intrinsics ------------------------===// // let IntrProperties = [IntrNoMem, IntrWillReturn] in { - def int_experimental_vector_reduce_v2_fadd : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty]>; - def int_experimental_vector_reduce_v2_fmul : Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, - llvm_anyvector_ty]>; - def int_experimental_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_umax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; - def int_experimental_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>], - [llvm_anyvector_ty]>; + + def int_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty]>; + def int_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty]>; + def int_vector_reduce_add : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_mul : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_and : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_or : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_xor : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_smax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_smin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_umax : 
Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_umin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_fmax : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; + def int_vector_reduce_fmin : Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty]>; } //===----- Matrix intrinsics ---------------------------------------------===// diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1457,15 +1457,15 @@ case Intrinsic::smul_fix_sat: case Intrinsic::bitreverse: case Intrinsic::is_constant: - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: // Target intrinsics case Intrinsic::arm_mve_vctp8: case Intrinsic::arm_mve_vctp16: @@ -1711,31 +1711,31 @@ return nullptr; const APInt &X = CI->getValue(); switch (IID) { - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: Acc = Acc + X; break; - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: Acc = Acc * X; break; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: Acc = Acc & X; break; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: Acc = Acc | X; break; - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: Acc = Acc ^ X; break; - case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::vector_reduce_smin: Acc = APIntOps::smin(Acc, X); break; - case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::vector_reduce_smax: Acc = APIntOps::smax(Acc, X); break; - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umin: Acc = APIntOps::umin(Acc, X); break; - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_umax: Acc = APIntOps::umax(Acc, X); break; } @@ -2240,15 +2240,15 @@ if (isa(Operands[0])) { switch (IntrinsicID) { default: break; - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case 
Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: return ConstantInt::get(Ty, 0); } } @@ -2259,15 +2259,15 @@ auto *Op = cast(Operands[0]); switch (IntrinsicID) { default: break; - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_umax: if (Constant *C = ConstantFoldVectorReduce(IntrinsicID, Op)) return C; break; diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This pass implements IR expansion for reduction intrinsics, allowing targets -// to enable the experimental intrinsics until just before codegen. +// to enable the intrinsics until just before codegen. // //===----------------------------------------------------------------------===// @@ -30,27 +30,27 @@ unsigned getOpcode(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: return Instruction::FAdd; - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: return Instruction::FMul; - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: return Instruction::Add; - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: return Instruction::Mul; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: return Instruction::And; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: return Instruction::Or; - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: return Instruction::Xor; - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: return Instruction::ICmp; - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: return Instruction::FCmp; default: llvm_unreachable("Unexpected ID"); @@ -59,17 +59,17 @@ RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) { switch (ID) { - case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::vector_reduce_smax: return RecurrenceDescriptor::MRK_SIntMax; - case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::vector_reduce_smin: return RecurrenceDescriptor::MRK_SIntMin; - case Intrinsic::experimental_vector_reduce_umax: + case 
Intrinsic::vector_reduce_umax: return RecurrenceDescriptor::MRK_UIntMax; - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umin: return RecurrenceDescriptor::MRK_UIntMin; - case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::vector_reduce_fmax: return RecurrenceDescriptor::MRK_FloatMax; - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fmin: return RecurrenceDescriptor::MRK_FloatMin; default: return RecurrenceDescriptor::MRK_Invalid; @@ -83,19 +83,19 @@ if (auto *II = dyn_cast(&I)) { switch (II->getIntrinsicID()) { default: break; - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: if (TTI->shouldExpandReduction(II)) Worklist.push_back(II); @@ -116,8 +116,8 @@ Builder.setFastMathFlags(FMF); switch (ID) { default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: { + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: { // FMFs must be attached to the call, otherwise it's an ordered reduction // and it can't be handled by generating a shuffle sequence. 
Value *Acc = II->getArgOperand(0); @@ -135,15 +135,15 @@ } break; } - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: { + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: { Value *Vec = II->getArgOperand(0); if (!isPowerOf2_32( cast(Vec->getType())->getNumElements())) @@ -152,8 +152,8 @@ Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); break; } - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: { + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: { // FIXME: We only expand 'fast' reductions here because the underlying // code in createMinMaxOp() assumes that comparisons use 'fast' // semantics. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6761,19 +6761,19 @@ LowerDeoptimizeCall(&I); return; - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_mul: - case Intrinsic::experimental_vector_reduce_and: - case Intrinsic::experimental_vector_reduce_or: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_smax: - case Intrinsic::experimental_vector_reduce_smin: - case Intrinsic::experimental_vector_reduce_umax: - case Intrinsic::experimental_vector_reduce_umin: - case Intrinsic::experimental_vector_reduce_fmax: - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_mul: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: visitVectorReduce(I, Intrinsic); return; @@ -8936,7 +8936,7 @@ SDFlags.copyFMF(*FPMO); switch (Intrinsic) { - case Intrinsic::experimental_vector_reduce_v2_fadd: + case Intrinsic::vector_reduce_fadd: if (SDFlags.hasAllowReassociation()) Res = DAG.getNode(ISD::FADD, dl, VT, Op1, DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2, SDFlags), @@ -8944,7 +8944,7 @@ else Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2, SDFlags); break; - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fmul: if (SDFlags.hasAllowReassociation()) Res = DAG.getNode(ISD::FMUL, dl, VT, Op1, DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2, SDFlags), @@ -8952,37 +8952,37 @@ else Res = 
DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2, SDFlags); break; - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_mul: Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::vector_reduce_xor: Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::vector_reduce_smax: Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::vector_reduce_smin: Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::vector_reduce_umax: Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::vector_reduce_umin: Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); break; - case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::vector_reduce_fmax: Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); break; - case Intrinsic::experimental_vector_reduce_fmin: + case Intrinsic::vector_reduce_fmin: Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); break; default: diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsX86.h" @@ -717,14 +718,44 @@ } case 'e': { SmallVector Groups; - static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[fi][0-9]+"); + static const Regex R("^experimental.vector.reduce.([a-z]+)\\.[a-z][0-9]+"); if (R.match(Name, &Groups)) { + Intrinsic::ID ID; + ID = StringSwitch(Groups[1]) + .Case("add", Intrinsic::vector_reduce_add) + .Case("mul", Intrinsic::vector_reduce_mul) + .Case("and", Intrinsic::vector_reduce_and) + .Case("or", Intrinsic::vector_reduce_or) + .Case("xor", Intrinsic::vector_reduce_xor) + .Case("smax", Intrinsic::vector_reduce_smax) + .Case("smin", Intrinsic::vector_reduce_smin) + .Case("umax", Intrinsic::vector_reduce_umax) + .Case("umin", Intrinsic::vector_reduce_umin) + .Case("fmax", Intrinsic::vector_reduce_fmax) + .Case("fmin", Intrinsic::vector_reduce_fmin) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) { + rename(F); + auto Args = F->getFunctionType()->params(); + SmallVector Tys; + if (ID == Intrinsic::vector_reduce_fmul || + ID == Intrinsic::vector_reduce_fadd) + Tys = {F->getFunctionType()->getReturnType(), Args[1]}; + else + Tys = {Args[0]}; + NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + return true; + } + } + static const Regex R2( + "^experimental.vector.reduce.v2.([a-z]+)\\.[fi][0-9]+"); + Groups.clear(); + if (R2.match(Name, &Groups)) { Intrinsic::ID ID = Intrinsic::not_intrinsic; if (Groups[1] == "fadd") - ID = 
Intrinsic::experimental_vector_reduce_v2_fadd; + ID = Intrinsic::vector_reduce_fadd; if (Groups[1] == "fmul") - ID = Intrinsic::experimental_vector_reduce_v2_fmul; - + ID = Intrinsic::vector_reduce_fmul; if (ID != Intrinsic::not_intrinsic) { rename(F); auto Args = F->getFunctionType()->params(); @@ -3620,28 +3651,6 @@ DefaultCase(); return; } - case Intrinsic::experimental_vector_reduce_v2_fmul: { - SmallVector Args; - if (CI->isFast()) - Args.push_back(ConstantFP::get(CI->getOperand(0)->getType(), 1.0)); - else - Args.push_back(CI->getOperand(0)); - Args.push_back(CI->getOperand(1)); - NewCall = Builder.CreateCall(NewFn, Args); - cast(NewCall)->copyFastMathFlags(CI); - break; - } - case Intrinsic::experimental_vector_reduce_v2_fadd: { - SmallVector Args; - if (CI->isFast()) - Args.push_back(Constant::getNullValue(CI->getOperand(0)->getType())); - else - Args.push_back(CI->getOperand(0)); - Args.push_back(CI->getOperand(1)); - NewCall = Builder.CreateCall(NewFn, Args); - cast(NewCall)->copyFastMathFlags(CI); - break; - } case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -326,8 +326,7 @@ Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; Type *Tys[] = {Acc->getType(), Src->getType()}; - auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_v2_fadd, Tys); + auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fadd, Tys); return createCallHelper(Decl, Ops, this); } @@ -335,51 +334,44 @@ Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; Type *Tys[] = {Acc->getType(), Src->getType()}; - auto Decl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_vector_reduce_v2_fmul, Tys); + auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fmul, Tys); return createCallHelper(Decl, Ops, this); } CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_add, Src); } CallInst *IRBuilderBase::CreateMulReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_mul, Src); } CallInst *IRBuilderBase::CreateAndReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_and, Src); } CallInst *IRBuilderBase::CreateOrReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_or, Src); } CallInst *IRBuilderBase::CreateXorReduce(Value *Src) { - return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor, - Src); + return getReductionIntrinsic(this, Intrinsic::vector_reduce_xor, Src); } CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) { - auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax - : Intrinsic::experimental_vector_reduce_umax; + auto ID = + IsSigned ? Intrinsic::vector_reduce_smax : Intrinsic::vector_reduce_umax; return getReductionIntrinsic(this, ID, Src); } CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) { - auto ID = IsSigned ? 
Intrinsic::experimental_vector_reduce_smin - : Intrinsic::experimental_vector_reduce_umin; + auto ID = + IsSigned ? Intrinsic::vector_reduce_smin : Intrinsic::vector_reduce_umin; return getReductionIntrinsic(this, ID, Src); } CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) { - auto Rdx = getReductionIntrinsic( - this, Intrinsic::experimental_vector_reduce_fmax, Src); + auto Rdx = getReductionIntrinsic(this, Intrinsic::vector_reduce_fmax, Src); if (NoNaN) { FastMathFlags FMF; FMF.setNoNaNs(); @@ -389,8 +381,7 @@ } CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) { - auto Rdx = getReductionIntrinsic( - this, Intrinsic::experimental_vector_reduce_fmin, Src); + auto Rdx = getReductionIntrinsic(this, Intrinsic::vector_reduce_fmin, Src); if (NoNaN) { FastMathFlags FMF; FMF.setNoNaNs(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -219,8 +219,8 @@ bool shouldExpandReduction(const IntrinsicInst *II) const { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: // We don't have legalization support for ordered FP reductions. return !II->getFastMathFlags().allowReassoc(); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -195,8 +195,8 @@ bool shouldExpandReduction(const IntrinsicInst *II) const { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_vector_reduce_v2_fadd: - case Intrinsic::experimental_vector_reduce_v2_fmul: + case Intrinsic::vector_reduce_fadd: + case Intrinsic::vector_reduce_fmul: // We don't have legalization support for ordered FP reductions. 
return !II->getFastMathFlags().allowReassoc(); default: diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -270,7 +270,7 @@ case Intrinsic::uadd_sat: case Intrinsic::ssub_sat: case Intrinsic::usub_sat: - case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::vector_reduce_add: continue; case Intrinsic::fma: case Intrinsic::trunc: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1824,8 +1824,7 @@ } auto m_AddRdx = [](Value *&Vec) { - return m_OneUse( - m_Intrinsic(m_Value(Vec))); + return m_OneUse(m_Intrinsic(m_Value(Vec))); }; Value *V0, *V1; if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) && @@ -1833,8 +1832,8 @@ // Difference of sums is sum of differences: // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1) Value *Sub = Builder.CreateSub(V0, V1); - Value *Rdx = Builder.CreateIntrinsic( - Intrinsic::experimental_vector_reduce_add, {Sub->getType()}, {Sub}); + Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add, + {Sub->getType()}, {Sub}); return replaceInstUsesWith(I, Rdx); } @@ -2280,9 +2279,8 @@ } auto m_FaddRdx = [](Value *&Sum, Value *&Vec) { - return m_OneUse( - m_Intrinsic( - m_Value(Sum), m_Value(Vec))); + return m_OneUse(m_Intrinsic(m_Value(Sum), + m_Value(Vec))); }; Value *A0, *A1, *V0, *V1; if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) && @@ -2290,9 +2288,9 @@ // Difference of sums is sum of differences: // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1 Value *Sub = Builder.CreateFSubFMF(V0, V1, &I); - Value *Rdx = Builder.CreateIntrinsic( - Intrinsic::experimental_vector_reduce_v2_fadd, - {A0->getType(), Sub->getType()}, {A0, Sub}, &I); + Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd, + {A0->getType(), Sub->getType()}, + {A0, Sub}, &I); return BinaryOperator::CreateFSubFMF(Rdx, A1, &I); } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3001,7 +3001,7 @@ setOrigin(&I, getOrigin(&I, 0)); } - // Instrument experimental.vector.reduce.or intrinsic. + // Instrument vector.reduce.or intrinsic. // Valid (non-poisoned) set bits in the operand pull low the // corresponding shadow bits. void handleVectorReduceOrIntrinsic(IntrinsicInst &I) { @@ -3019,7 +3019,7 @@ setOrigin(&I, getOrigin(&I, 0)); } - // Instrument experimental.vector.reduce.or intrinsic. + // Instrument vector.reduce.and intrinsic. // Valid (non-poisoned) unset bits in the operand pull down the // corresponding shadow bits. 
void handleVectorReduceAndIntrinsic(IntrinsicInst &I) { @@ -3254,15 +3254,15 @@ case Intrinsic::masked_load: handleMaskedLoad(I); break; - case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::vector_reduce_and: handleVectorReduceAndIntrinsic(I); break; - case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::vector_reduce_or: handleVectorReduceOrIntrinsic(I); break; - case Intrinsic::experimental_vector_reduce_add: - case Intrinsic::experimental_vector_reduce_xor: - case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_xor: + case Intrinsic::vector_reduce_mul: handleVectorReduceIntrinsic(I); break; case Intrinsic::x86_sse_stmxcsr: diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll --- a/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-reduce.ll @@ -2,278 +2,278 @@ ; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE ; COST-LABEL: add.i8.v8i8 -; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) ; CODE-LABEL: add.i8.v8i8 ; CODE: addv b0, v0.8b define i8 @add.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: add.i8.v16i8 -; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) ; CODE-LABEL: add.i8.v16i8 ; CODE: addv b0, v0.16b define i8 @add.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: add.i16.v4i16 -; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) ; CODE-LABEL: add.i16.v4i16 ; CODE: addv h0, v0.4h define i16 @add.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: add.i16.v8i16 -; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) ; CODE-LABEL: add.i16.v8i16 ; CODE: addv h0, v0.8h define i16 @add.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: add.i32.v4i32 -; COST: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 1 for instruction: %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) ; CODE-LABEL: add.i32.v4i32 ; CODE: addv s0, v0.4s define i32 @add.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) 
ret i32 %r } ; COST-LABEL: umin.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) ; CODE-LABEL: umin.i8.v8i8 ; CODE: uminv b0, v0.8b define i8 @umin.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: umin.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v) ; CODE-LABEL: umin.i8.v16i8 ; CODE: uminv b0, v0.16b define i8 @umin.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: umin.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) ; CODE-LABEL: umin.i16.v4i16 ; CODE: uminv h0, v0.4h define i16 @umin.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: umin.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v) ; CODE-LABEL: umin.i16.v8i16 ; CODE: uminv h0, v0.8h define i16 @umin.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: umin.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) ; CODE-LABEL: umin.i32.v4i32 ; CODE: uminv s0, v0.4s define i32 @umin.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: umax.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) ; CODE-LABEL: umax.i8.v8i8 ; CODE: umaxv b0, v0.8b define i8 @umax.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: umax.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v) ; CODE-LABEL: umax.i8.v16i8 ; CODE: umaxv b0, v0.16b define i8 @umax.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 
x i8> %v) ret i8 %r } ; COST-LABEL: umax.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) ; CODE-LABEL: umax.i16.v4i16 ; CODE: umaxv h0, v0.4h define i16 @umax.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: umax.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v) ; CODE-LABEL: umax.i16.v8i16 ; CODE: umaxv h0, v0.8h define i16 @umax.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: umax.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) ; CODE-LABEL: umax.i32.v4i32 ; CODE: umaxv s0, v0.4s define i32 @umax.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: smin.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) ; CODE-LABEL: smin.i8.v8i8 ; CODE: sminv b0, v0.8b define i8 @smin.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: smin.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v) ; CODE-LABEL: smin.i8.v16i8 ; CODE: sminv b0, v0.16b define i8 @smin.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: smin.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) ; CODE-LABEL: smin.i16.v4i16 ; CODE: sminv h0, v0.4h define i16 @smin.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: smin.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v) ; CODE-LABEL: smin.i16.v8i16 ; CODE: sminv h0, v0.8h define i16 @smin.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %v) + %r = call i16 
@llvm.vector.reduce.smin.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: smin.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) ; CODE-LABEL: smin.i32.v4i32 ; CODE: sminv s0, v0.4s define i32 @smin.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: smax.i8.v8i8 -; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) ; CODE-LABEL: smax.i8.v8i8 ; CODE: smaxv b0, v0.8b define i8 @smax.i8.v8i8(<8 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %v) + %r = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) ret i8 %r } ; COST-LABEL: smax.i8.v16i8 -; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %v) +; COST: Found an estimated cost of 608 for instruction: %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v) ; CODE-LABEL: smax.i8.v16i8 ; CODE: smaxv b0, v0.16b define i8 @smax.i8.v16i8(<16 x i8> %v) { - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %v) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v) ret i8 %r } ; COST-LABEL: smax.i16.v4i16 -; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %v) +; COST: Found an estimated cost of 64 for instruction: %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) ; CODE-LABEL: smax.i16.v4i16 ; CODE: smaxv h0, v0.4h define i16 @smax.i16.v4i16(<4 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %v) + %r = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) ret i16 %r } ; COST-LABEL: smax.i16.v8i16 -; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %v) +; COST: Found an estimated cost of 216 for instruction: %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v) ; CODE-LABEL: smax.i16.v8i16 ; CODE: smaxv h0, v0.8h define i16 @smax.i16.v8i16(<8 x i16> %v) { - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %v) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v) ret i16 %r } ; COST-LABEL: smax.i32.v4i32 -; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) ; CODE-LABEL: smax.i32.v4i32 ; CODE: smaxv s0, v0.4s define i32 @smax.i32.v4i32(<4 x i32> %v) { - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %v) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) ret i32 %r } ; COST-LABEL: fmin.f32.v4f32 -; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %v) +; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) ; CODE-LABEL: fmin.f32.v4f32 ; CODE: fminnmv s0, v0.4s define float @fmin.f32.v4f32(<4 x float> %v) { - %r = call nnan float 
@llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %v)
+ %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v)
ret float %r
}
; COST-LABEL: fmax.f32.v4f32
-; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %v)
+; COST: Found an estimated cost of 34 for instruction: %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
; CODE-LABEL: fmax.f32.v4f32
; CODE: fmaxnmv s0, v0.4s
define float @fmax.f32.v4f32(<4 x float> %v) {
- %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %v)
+ %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
ret float %r
}
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
--- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
@@ -8,155 +8,155 @@
define i32 @reduce_i64(i32 %arg) {
; V8M-RECIP-LABEL: 'reduce_i64'
-; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef)
-; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef)
-; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef)
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; NEON-RECIP-LABEL: 'reduce_i64'
-; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef)
-; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef)
-; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'reduce_i64' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'reduce_i64' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'reduce_i64' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'reduce_i64' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) - %V16 = call i64 
@llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-RECIP-LABEL: 'reduce_i32' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'reduce_i32' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) 
-; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'reduce_i32' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 782 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4120 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5658 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 11806 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36390 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'reduce_i32' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 
@llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'reduce_i32' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'reduce_i32' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) + %V32 = call i8 
@llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+ %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+ %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
ret i32 undef
}
-declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
--- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
@@ -5,171 +5,171 @@
define i32 @reduce_i64(i32 %arg) {
; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef)
-; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 
@llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 
@llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 
@llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for 
instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found 
an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x 
i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
-declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
-declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
--- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
@@ -5,171 +5,171 @@
define i32 @reduce_i64(i32 %arg) {
; V8M-LABEL: 'reduce_i64'
-; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef)
-; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef)
-; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef)
-; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef)
-; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef)
+; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; V8M-NEXT: Cost Model: Found an estimated 
cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 
@llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 
for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; 
V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: 
Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 
@llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; 
MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 
@llvm.vector.reduce.smin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll @@ -5,171 +5,171 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 
@llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) 
+ %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 
@llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost 
Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost 
Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: 
Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an 
estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare 
i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll --- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll @@ -5,171 +5,171 @@ define i32 @reduce_i64(i32 %arg) { ; V8M-LABEL: 'reduce_i64' -; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 167 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i64' -; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call 
i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 178 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i64' -; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; V8M-LABEL: 'reduce_i32' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 
@llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i32' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 
@llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 632 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2184 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; V8M-LABEL: 'reduce_i16' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for 
instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 303 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 503 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i16' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 2720 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 8880 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) - %V16 = 
call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; V8M-LABEL: 'reduce_i8' -; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 379 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; V8M-NEXT: Cost Model: Found an estimated cost of 763 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; NEON-NEXT: Cost 
Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 395 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 493 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 689 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 1081 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-LABEL: 'reduce_i8' -; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 11820 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; MVE-NEXT: Cost Model: Found an estimated cost of 36412 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) - %V4 = call i8 
@llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll @@ -12,279 +12,279 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i64' -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated 
cost of 17 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 
@llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i32' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
ret i32 undef ; ; SLM-LABEL: 'reduce_i16' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = 
call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 
@llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for 
instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i8' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) - %V32 = call i8 
@llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll 
b/llvm/test/Analysis/CostModel/X86/reduce-and.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -10,258 +10,258 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x 
i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; 
AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.and.v4i32(<4 
x i32> undef) + %V8 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; 
SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 
@llvm.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> undef) ret i32 undef } define i32 @reduce_i1(i32 %arg) { ; SSE-LABEL: 'reduce_i1' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 
@llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> 
undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) - %V2 = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> undef) - %V4 = call i1 
@llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) - %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) - %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) - %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) - %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) - %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) + %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>) -declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1>) +declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1>) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.and.v128i1(<128 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmax.ll @@ -11,83 +11,83 @@ define i32 @reduce_f64(i32 %arg) { ; SSE-LABEL: 'reduce_f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: 
%V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> undef) - %V2 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> undef) - %V4 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> undef) - %V8 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> undef) - %V16 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> undef) + %V1 = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> undef) + %V2 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + %V4 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + %V8 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> undef) + %V16 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> undef) ret i32 undef } define i32 @reduce_f32(i32 %arg) { ; SSE-LABEL: 'reduce_f32' -; SSE-NEXT: Cost Model: Found an estimated cost 
of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f32' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> 
undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> undef) - %V2 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> undef) - %V4 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> undef) - %V8 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> undef) - %V16 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> undef) - %V32 = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> undef) + %V1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> undef) + %V2 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef) + %V4 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + %V8 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + %V16 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> undef) + %V32 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> undef) ret i32 undef } -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare 
double @llvm.vector.reduce.fmax.v16f64(<16 x double>) -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-fmin.ll @@ -11,83 +11,83 @@ define i32 @reduce_f64(i32 %arg) { ; SSE-LABEL: 'reduce_f64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> undef) - %V2 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> undef) - %V4 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> undef) - %V8 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> undef) - %V16 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> undef) + %V1 = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> undef) + %V2 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + %V4 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + %V8 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> undef) + %V16 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> undef) ret i32 undef } define i32 @reduce_f32(i32 %arg) { ; SSE-LABEL: 'reduce_f32' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) -; SSE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_f32' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_f32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) -; AVX512-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> undef) - %V2 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> undef) - %V4 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> undef) - %V8 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> undef) - %V16 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> undef) - %V32 = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> undef) + %V1 = call float @llvm.vector.reduce.fmin.v1f32(<1 x float> undef) + %V2 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef) + %V4 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + %V8 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + %V16 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> undef) + %V32 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> undef) ret i32 undef } -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) -declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float 
@llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmin.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -10,276 +10,276 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: 
Found an estimated cost of 37 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i64' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i64' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call 
i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i64' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) + 
%V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 
for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 
@llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 
@llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef) ; 
AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
- %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef)
- %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef)
- %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef)
- %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef)
- %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef)
- %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef)
+ %V2 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
+ %V4 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
+ %V8 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
+ %V16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
+ %V32 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> undef)
+ %V64 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> undef)
ret i32 undef
}

define i32 @reduce_i8(i32 %arg) {
; SSE-LABEL: 'reduce_i8'
-; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
- %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef)
- %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef)
- %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef)
- %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef)
- %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef)
- %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef)
- %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef)
+ %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
+ %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
+ %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
+ %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
+ %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
+ %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
+ %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
ret i32 undef
}

-declare i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)
+declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)

-declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)

-declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
+declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)

-declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
+declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
+declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
--- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll
@@ -10,258 +10,258 @@
define i32 @reduce_i64(i32 %arg) {
; SSE-LABEL: 'reduce_i64'
-; SSE-NEXT: Cost Model: Found an
estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 
x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated 
cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 
@llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 
@llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> undef) ret i32 undef } define i32 @reduce_i1(i32 %arg) { ; SSE-LABEL: 'reduce_i1' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 
@llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 
@llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for 
instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 
@llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) - %V2 = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> undef) - %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) - %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) - %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) - %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) - %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) - %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) + %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) + %V4 = call 
i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) -declare i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1>) -declare i1 
@llvm.experimental.vector.reduce.or.v64i1(<64 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1>) +declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1>) +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.or.v128i1(<128 x i1>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smax.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSSE3-NEXT: 
Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 
@llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for 
instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 
@llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 
@llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 
@llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> 
undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) -declare i32 
@llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-smin.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an 
estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 
@llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 
@llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: 
%V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 
@llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = 
call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> 
undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 
7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 
= call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 
0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) 
+declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umax.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call 
i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 
@llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 
@llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for 
instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 
@llvm.vector.reduce.umax.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found 
an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 
@llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: 
Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> 
undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 
for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = 
call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 
@llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x 
i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-umin.ll @@ -11,322 +11,322 @@ define i32 @reduce_i64(i32 %arg) { ; SSE2-LABEL: 'reduce_i64' -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call 
i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i64' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE41-LABEL: 'reduce_i64' -; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE41-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i64' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x 
i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE2-LABEL: 'reduce_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 
@llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i32' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call 
i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> 
undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) - %V16 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for 
instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i16' -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = 
call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> undef) - %V8 = call i16 
@llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE4-LABEL: 'reduce_i8' -; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; SSE4-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; SSE4-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 
= call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = 
call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> undef) - %V4 = call i8 
@llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 
@llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll --- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -10,280 +10,280 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' -; AVX512-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) - %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) - %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) - %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) - %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) + %V1 = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> undef) + %V2 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef) + %V4 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef) + %V8 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> undef) + %V16 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> undef) ret i32 undef } define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x 
i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) - %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) - %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) - %V16 = 
call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) - %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) + %V2 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) + %V4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) + %V8 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) + %V16 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> undef) + %V32 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> undef) ret i32 undef } define i32 @reduce_i16(i32 %arg) { ; SSE-LABEL: 'reduce_i16' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i16' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) - %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) - %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) - %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) - %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) - %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) + %V2 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> undef) + %V4 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) + %V8 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) + %V16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) + %V32 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> undef) + %V64 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> undef) ret i32 undef } define i32 @reduce_i8(i32 %arg) { ; SSE-LABEL: 'reduce_i8' -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE-NEXT: Cost Model: 
Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x 
i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) - %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) - %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) - %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) - %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) - %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) - %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) + %V2 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> undef) + %V4 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) + %V8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) + %V16 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) + %V32 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) + %V64 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) + %V128 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> undef) ret i32 undef } define i32 @reduce_i1(i32 %arg) { ; SSE2-LABEL: 'reduce_i1' -; SSE2-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSSE3-LABEL: 'reduce_i1' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'reduce_i1' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' -; AVX1-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = 
call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' -; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 775 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 776 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 
@llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; - %V1 = call i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1> undef) - %V2 = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> undef) - %V4 = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> undef) - %V8 = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> undef) - %V16 = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> undef) - %V32 = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> undef) - %V64 = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> undef) - %V128 = call i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1> undef) + %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef) + %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef) + %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef) + %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef) + %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef) + %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) + %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) + %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ret i32 undef } -declare i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) 
+declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>) -declare i1 @llvm.experimental.vector.reduce.xor.v1i1(<1 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v128i1(<128 x i1>) +declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1>) +declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.xor.v128i1(<128 x i1>) diff --git a/llvm/test/Assembler/invalid-vecreduce.ll b/llvm/test/Assembler/invalid-vecreduce.ll --- a/llvm/test/Assembler/invalid-vecreduce.ll +++ b/llvm/test/Assembler/invalid-vecreduce.ll @@ -1,34 +1,34 @@ ; RUN: not opt -S < %s 2>&1 | FileCheck %s ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: float (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64 +; CHECK-NEXT: float (double, <2 x double>)* @llvm.vector.reduce.fadd.f32.f64.v2f64 define float @fadd_invalid_scalar_res(double %acc, <2 x double> %in) { - %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) + %res = call float @llvm.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) ret float %res } ; CHECK: Intrinsic has incorrect argument type! 
-; CHECK-NEXT: double (float, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64 +; CHECK-NEXT: double (float, <2 x double>)* @llvm.vector.reduce.fadd.f64.f32.v2f64 define double @fadd_invalid_scalar_start(float %acc, <2 x double> %in) { - %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) + %res = call double @llvm.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) ret double %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64 +; CHECK-NEXT: <2 x double> (double, <2 x double>)* @llvm.vector.reduce.fadd.v2f64.f64.v2f64 define <2 x double> @fadd_invalid_vector_res(double %acc, <2 x double> %in) { - %res = call <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) + %res = call <2 x double> @llvm.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) ret <2 x double> %res } ; CHECK: Intrinsic has incorrect argument type! -; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64 +; CHECK-NEXT: double (<2 x double>, <2 x double>)* @llvm.vector.reduce.fadd.f64.v2f64.v2f64 define double @fadd_invalid_vector_start(<2 x double> %in, <2 x double> %acc) { - %res = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) + %res = call double @llvm.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) ret double %res } -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) -declare <2 x double> @llvm.experimental.vector.reduce.v2.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) +declare float @llvm.vector.reduce.fadd.f32.f64.v2f64(double %acc, <2 x double> %in) +declare double @llvm.vector.reduce.fadd.f64.f32.v2f64(float %acc, <2 x double> %in) +declare double @llvm.vector.reduce.fadd.f64.v2f64.v2f64(<2 x double> %acc, <2 x double> %in) +declare <2 x double> @llvm.vector.reduce.fadd.v2f64.f64.v2f64(double %acc, <2 x double> %in) diff --git a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll --- a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll +++ b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll @@ -1,64 +1,130 @@ ; RUN: opt -S < %s | FileCheck %s ; RUN: llvm-dis < %s.bc | FileCheck %s -define float @fadd_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fadd_acc -; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) - %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + +define float @fadd_v2(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_v2 +; CHECK: %res = call float @llvm.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fadd_undef(<4 x float> %in) { -; CHECK-LABEL: @fadd_undef -; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %in) - %res = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) 
+define float @fadd_v2_fast(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fadd_v2_fast +; CHECK: %res = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fadd_fast_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fadd_fast_acc -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %in) +define float @fmul_v2(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_v2 +; CHECK: %res = call float @llvm.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fadd_fast_undef(<4 x float> %in) { -; CHECK-LABEL: @fadd_fast_undef -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %in) +define float @fmul_v2_fast(<4 x float> %in, float %acc) { +; CHECK-LABEL: @fmul_v2_fast +; CHECK: %res = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) + %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) ret float %res } -define float @fmul_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fmul_acc -; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %acc, <4 x float> %in) - %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) +define float @fmin(<4 x float> %in) { +; CHECK-LABEL: @fmin +; CHECK: %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %in) ret float %res } -define float @fmul_undef(<4 x float> %in) { -; CHECK-LABEL: @fmul_undef -; CHECK: %res = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %in) - %res = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) +define float @fmax(<4 x float> %in) { +; CHECK-LABEL: @fmax +; CHECK: %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %in) + %res = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %in) ret float %res } -define float @fmul_fast_acc(<4 x float> %in, float %acc) { -; CHECK-LABEL: @fmul_fast_acc -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %in) - ret float %res +define i32 @and(<4 x i32> %in) { +; CHECK-LABEL: @and +; CHECK: %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %in) + ret i32 %res } -define float @fmul_fast_undef(<4 x float> %in) { -; CHECK-LABEL: @fmul_fast_undef -; CHECK: %res = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %in) - %res = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %in) - ret float %res +define i32 @or(<4 x i32> %in) { +; CHECK-LABEL: @or +; CHECK: %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> 
%in) + %res = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @xor(<4 x i32> %in) { +; CHECK-LABEL: @xor +; CHECK: %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @smin(<4 x i32> %in) { +; CHECK-LABEL: @smin +; CHECK: %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %in) + ret i32 %res } -declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>) -; CHECK: declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +define i32 @smax(<4 x i32> %in) { +; CHECK-LABEL: @smax +; CHECK: %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @umin(<4 x i32> %in) { +; CHECK-LABEL: @umin +; CHECK: %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %in) + ret i32 %res +} + +define i32 @umax(<4 x i32> %in) { +; CHECK-LABEL: @umax +; CHECK: %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %in) + %res = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %in) + ret i32 %res +} + + +declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) + +declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) +; CHECK: declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) + +declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +; CHECK: declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) + +declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) + +declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) +; CHECK: declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) + + + + -declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>) -; CHECK: declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) diff --git a/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc b/llvm/test/Bitcode/upgrade-vecreduce-intrinsics.ll.bc index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch (binary bitcode test input; contents omitted) diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i16 
@llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) define i8 @add_B(<16 x i8>* %arr) { ; CHECK-LABEL: add_B ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b %bin.rdx = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %bin.rdx) + %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %bin.rdx) ret i8 %r } @@ -18,7 +18,7 @@ ; CHECK-LABEL: add_H ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h %bin.rdx = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %bin.rdx) + %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx) ret i16 %r } @@ -26,7 +26,7 @@ ; CHECK-LABEL: add_S ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %bin.rdx) + %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx) ret i32 %r } @@ -35,11 +35,11 @@ ; CHECK-NOT: addv ; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d %bin.rdx = load <2 x i64>, <2 x i64>* %arr - %r = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %bin.rdx) + %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %bin.rdx) ret i64 %r } -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) { ; CHECK-LABEL: oversized_ADDV_256 @@ -55,16 +55,16 @@ %7 = icmp slt <8 x i32> %6, zeroinitializer %8 = sub nsw <8 x i32> zeroinitializer, %6 %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 - %r = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %9) + %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %9) ret i32 %r } -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(<16 x i32>* %arr) { ; CHECK-LABEL: oversized_ADDV_512 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) + %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -2,28 +2,28 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) - -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) - -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x 
i8>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) + +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) + +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) ; CHECK-LABEL: smax_B ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -31,7 +31,7 @@ ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -39,7 +39,7 @@ ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -47,7 +47,7 @@ ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -55,7 +55,7 @@ ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -63,7 +63,7 @@ ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -71,7 +71,7 @@ ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -79,7 +79,7 @@ ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -87,7 +87,7 @@ ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> 
%arr.load) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -95,7 +95,7 @@ ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %arr.load) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %arr.load) ret i8 %r } @@ -103,7 +103,7 @@ ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { %arr.load = load <8 x i16>, <8 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %arr.load) ret i16 %r } @@ -111,7 +111,7 @@ ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { %arr.load = load <4 x i32>, <4 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %arr.load) ret i32 %r } @@ -119,7 +119,7 @@ ; CHECK: fmaxnmv define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) { %arr.load = load <4 x float>, <4 x float>* %arr - %r = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %arr.load) + %r = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %arr.load) ret float %r } @@ -127,22 +127,22 @@ ; CHECK: fminnmv define float @fminnm_S(<4 x float>* nocapture readonly %arr) { %arr.load = load <4 x float>, <4 x float>* %arr - %r = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %arr.load) + %r = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %arr.load) ret float %r } -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_256 ; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: umaxv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_512 @@ -151,22 +151,22 @@ ; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %arr.load) ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_256 ; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: uminv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) define i32 @oversized_umin_512(<16 x i32>* 
nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_512 @@ -175,22 +175,22 @@ ; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %arr.load) ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_256 ; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: smaxv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_512 @@ -199,22 +199,22 @@ ; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %arr.load) ret i32 %r } -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smin_256 ; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: sminv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %r = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %arr.load) + %r = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %arr.load) ret i16 %r } -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smin_512 @@ -223,6 +223,6 @@ ; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %r = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %arr.load) + %r = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %arr.load) ret i32 %r } diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -141,7 +141,7 @@ ret <2 x i64> %tmp4 } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) { ; CHECK-LABEL: uabdl8h_rdx @@ -155,11 +155,11 @@ %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff - %reduced_v = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %absel) + %reduced_v = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %absel) ret i16 %reduced_v } -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 
@llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) { ; CHECK-LABEL: uabdl4s_rdx @@ -173,11 +173,11 @@ %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff - %reduced_v = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %absel) + %reduced_v = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %absel) ret i32 %reduced_v } -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) { ; CHECK: uabdl2d_rdx @@ -191,7 +191,7 @@ %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff - %reduced_v = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %absel) + %reduced_v = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %absel) ret i64 %reduced_v } diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -205,7 +205,7 @@ ret void } -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @test_udot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) { entry: @@ -218,7 +218,7 @@ %4 = load <8 x i8>, <8 x i8>* %3 %5 = zext <8 x i8> %4 to <8 x i32> %6 = mul nuw nsw <8 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) ret i32 %7 } @@ -233,11 +233,11 @@ %4 = load <8 x i8>, <8 x i8>* %3 %5 = sext <8 x i8> %4 to <8 x i32> %6 = mul nsw <8 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6) ret i32 %7 } -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @test_udot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) { entry: @@ -250,7 +250,7 @@ %4 = load <16 x i8>, <16 x i8>* %3 %5 = zext <16 x i8> %4 to <16 x i32> %6 = mul nuw nsw <16 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6) %op.extra = add i32 %7, %sum ret i32 %op.extra } @@ -265,7 +265,7 @@ %0 = bitcast i8* %a1 to <16 x i8>* %1 = load <16 x i8>, <16 x i8>* %0 %2 = zext <16 x i8> %1 to <16 x i32> - %3 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %2) + %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) ret i32 %3 } @@ -280,7 +280,7 @@ %4 = load <16 x i8>, <16 x i8>* %3 %5 = sext <16 x i8> %4 to <16 x i32> %6 = mul nsw <16 x i32> %5, %2 - %7 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6) %op.extra = add nsw i32 %7, %sum ret i32 %op.extra } @@ -295,6 +295,6 @@ %0 = bitcast i8* %a1 to <16 x i8>* %1 = load <16 x i8>, <16 x i8>* %0 %2 = sext <16 x i8> %1 to <16 x i32> - %3 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %2) + %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) ret i32 %3 } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll @@ -29,7 +29,7 @@ ; CHECK-LABEL: fmaxv_v4f16: ; CHECK: fmaxnmv h0, v0.4h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a) + %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res } @@ -38,7 +38,7 @@ ; CHECK-LABEL: fmaxv_v8f16: ; CHECK: fmaxnmv h0, v0.8h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %a) + %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res } @@ -49,7 +49,7 @@ ; VBITS_GE_256-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_256-NEXT: ret %op = load <16 x half>, <16 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) ret half %res } @@ -60,7 +60,7 @@ ; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_512-NEXT: ret %op = load <32 x half>, <32 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v32f16(<32 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op) ret half %res } @@ -71,7 +71,7 @@ ; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_1024-NEXT: ret %op = load <64 x half>, <64 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v64f16(<64 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op) ret half %res } @@ -82,7 +82,7 @@ ; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h ; VBITS_GE_2048-NEXT: ret %op = load <128 x half>, <128 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmax.v128f16(<128 x half> %op) + %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op) ret half %res } @@ -91,7 +91,7 @@ ; CHECK-LABEL: fmaxv_v2f32: ; CHECK: fmaxnmp s0, v0.2s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a) + %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res } @@ -100,7 +100,7 @@ ; CHECK-LABEL: fmaxv_v4f32: ; CHECK: fmaxnmv s0, v0.4s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) + %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res } @@ -111,7 +111,7 @@ ; VBITS_GE_256-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_256-NEXT: ret %op = load <8 x float>, <8 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) ret float %res } @@ -122,7 +122,7 @@ ; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_512-NEXT: ret %op = load <16 x float>, <16 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op) ret float %res } @@ -133,7 +133,7 @@ ; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_1024-NEXT: ret %op = load <32 x float>, <32 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float> %op) + %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op) ret float %res } @@ -144,7 +144,7 @@ ; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s ; VBITS_GE_2048-NEXT: ret %op = load <64 x float>, <64 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmax.v64f32(<64 x float> %op) + %res = call float 
@llvm.vector.reduce.fmax.v64f32(<64 x float> %op) ret float %res } @@ -153,7 +153,7 @@ ; CHECK-LABEL: fmaxv_v1f64: ; CHECK-NOT: fmax ; CHECK: ret - %res = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) + %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %res } @@ -162,7 +162,7 @@ ; CHECK-LABEL: fmaxv_v2f64: ; CHECK: fmaxnmp d0, v0.2d ; CHECK-NEXT: ret - %res = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res } @@ -173,7 +173,7 @@ ; VBITS_GE_256-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_256-NEXT: ret %op = load <4 x double>, <4 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) ret double %res } @@ -184,7 +184,7 @@ ; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_512-NEXT: ret %op = load <8 x double>, <8 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op) ret double %res } @@ -195,7 +195,7 @@ ; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_1024-NEXT: ret %op = load <16 x double>, <16 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op) ret double %res } @@ -206,7 +206,7 @@ ; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d ; VBITS_GE_2048-NEXT: ret %op = load <32 x double>, <32 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmax.v32f64(<32 x double> %op) + %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op) ret double %res } @@ -219,7 +219,7 @@ ; CHECK-LABEL: fminv_v4f16: ; CHECK: fminnmv h0, v0.4h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a) + %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res } @@ -228,7 +228,7 @@ ; CHECK-LABEL: fminv_v8f16: ; CHECK: fminnmv h0, v0.8h ; CHECK-NEXT: ret - %res = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %a) + %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res } @@ -239,7 +239,7 @@ ; VBITS_GE_256-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_256-NEXT: ret %op = load <16 x half>, <16 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) ret half %res } @@ -250,7 +250,7 @@ ; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_512-NEXT: ret %op = load <32 x half>, <32 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v32f16(<32 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op) ret half %res } @@ -261,7 +261,7 @@ ; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_1024-NEXT: ret %op = load <64 x half>, <64 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v64f16(<64 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op) ret half %res } @@ -272,7 +272,7 @@ ; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h ; VBITS_GE_2048-NEXT: ret %op = load <128 x half>, <128 x half>* %a - %res = call half @llvm.experimental.vector.reduce.fmin.v128f16(<128 x half> %op) + %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op) ret half %res } @@ -281,7 
+281,7 @@ ; CHECK-LABEL: fminv_v2f32: ; CHECK: fminnmp s0, v0.2s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a) + %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res } @@ -290,7 +290,7 @@ ; CHECK-LABEL: fminv_v4f32: ; CHECK: fminnmv s0, v0.4s ; CHECK: ret - %res = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) + %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res } @@ -301,7 +301,7 @@ ; VBITS_GE_256-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_256-NEXT: ret %op = load <8 x float>, <8 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) ret float %res } @@ -312,7 +312,7 @@ ; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_512-NEXT: ret %op = load <16 x float>, <16 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op) ret float %res } @@ -323,7 +323,7 @@ ; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_1024-NEXT: ret %op = load <32 x float>, <32 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op) ret float %res } @@ -334,7 +334,7 @@ ; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s ; VBITS_GE_2048-NEXT: ret %op = load <64 x float>, <64 x float>* %a - %res = call float @llvm.experimental.vector.reduce.fmin.v64f32(<64 x float> %op) + %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op) ret float %res } @@ -343,7 +343,7 @@ ; CHECK-LABEL: fminv_v1f64: ; CHECK-NOT: fmin ; CHECK: ret - %res = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) + %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) ret double %res } @@ -352,7 +352,7 @@ ; CHECK-LABEL: fminv_v2f64: ; CHECK: fminnmp d0, v0.2d ; CHECK-NEXT: ret - %res = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res } @@ -363,7 +363,7 @@ ; VBITS_GE_256-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_256-NEXT: ret %op = load <4 x double>, <4 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) ret double %res } @@ -374,7 +374,7 @@ ; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_512-NEXT: ret %op = load <8 x double>, <8 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op) ret double %res } @@ -385,7 +385,7 @@ ; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_1024-NEXT: ret %op = load <16 x double>, <16 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op) ret double %res } @@ -396,50 +396,50 @@ ; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d ; VBITS_GE_2048-NEXT: ret %op = load <32 x double>, <32 x double>* %a - %res = call double @llvm.experimental.vector.reduce.fmin.v32f64(<32 x double> %op) + %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op) ret double %res } attributes #0 = { "target-features"="+sve" } -declare half 
@llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v32f16(<32 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v64f16(<64 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v128f16(<128 x half>) - -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v32f32(<32 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v64f32(<64 x float>) - -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v32f64(<32 x double>) - -declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v32f16(<32 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v64f16(<64 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v128f16(<128 x half>) - -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v32f32(<32 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v64f32(<64 x float>) - -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v32f64(<32 x double>) +declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>) +declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>) +declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>) +declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>) +declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>) + +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>) + +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double 
@llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>) + +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>) +declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>) +declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>) +declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>) +declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>) + +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>) +declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>) + +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll @@ -29,7 +29,7 @@ ; CHECK-LABEL: uaddv_v8i8: ; CHECK: addv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res } @@ -38,7 +38,7 @@ ; CHECK-LABEL: uaddv_v16i8: ; CHECK: addv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res } @@ -50,7 +50,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) ret i8 %res } @@ -62,7 +62,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op) ret i8 %res } @@ -74,7 +74,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op) ret i8 %res } @@ -86,7 +86,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.add.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op) ret i8 %res } @@ -95,7 +95,7 @@ ; CHECK-LABEL: uaddv_v4i16: ; CHECK: addv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res } @@ -104,7 +104,7 @@ ; CHECK-LABEL: uaddv_v8i16: ; CHECK: addv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res } @@ 
-116,7 +116,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) ret i16 %res } @@ -128,7 +128,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op) ret i16 %res } @@ -140,7 +140,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op) ret i16 %res } @@ -152,7 +152,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.add.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op) ret i16 %res } @@ -161,7 +161,7 @@ ; CHECK-LABEL: uaddv_v2i32: ; CHECK: addp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res } @@ -170,7 +170,7 @@ ; CHECK-LABEL: uaddv_v4i32: ; CHECK: addv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res } @@ -182,7 +182,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) ret i32 %res } @@ -194,7 +194,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op) ret i32 %res } @@ -206,7 +206,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op) ret i32 %res } @@ -218,7 +218,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.add.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op) ret i32 %res } @@ -227,7 +227,7 @@ ; CHECK-LABEL: uaddv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a) ret i64 %res } @@ -236,7 +236,7 @@ ; CHECK-LABEL: uaddv_v2i64: ; CHECK: addp d0, v0.2d ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ret i64 %res } @@ -248,7 +248,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op) ret i64 %res } @@ -260,7 +260,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; 
VBITS_GE_512-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op) ret i64 %res } @@ -272,7 +272,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op) ret i64 %res } @@ -284,7 +284,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.add.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op) ret i64 %res } @@ -297,7 +297,7 @@ ; CHECK-LABEL: smaxv_v8i8: ; CHECK: smaxv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) ret i8 %res } @@ -306,7 +306,7 @@ ; CHECK-LABEL: smaxv_v16i8: ; CHECK: smaxv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) ret i8 %res } @@ -318,7 +318,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op) ret i8 %res } @@ -330,7 +330,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op) ret i8 %res } @@ -342,7 +342,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op) ret i8 %res } @@ -354,7 +354,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smax.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op) ret i8 %res } @@ -363,7 +363,7 @@ ; CHECK-LABEL: smaxv_v4i16: ; CHECK: smaxv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) ret i16 %res } @@ -372,7 +372,7 @@ ; CHECK-LABEL: smaxv_v8i16: ; CHECK: smaxv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) ret i16 %res } @@ -384,7 +384,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op) ret i16 %res } @@ -396,7 +396,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op) ret i16 %res } @@ -408,7 +408,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res 
= call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op) ret i16 %res } @@ -420,7 +420,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smax.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op) ret i16 %res } @@ -429,7 +429,7 @@ ; CHECK-LABEL: smaxv_v2i32: ; CHECK: smaxp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ret i32 %res } @@ -438,7 +438,7 @@ ; CHECK-LABEL: smaxv_v4i32: ; CHECK: smaxv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ret i32 %res } @@ -450,7 +450,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op) ret i32 %res } @@ -462,7 +462,7 @@ ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op) ret i32 %res } @@ -474,7 +474,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op) ret i32 %res } @@ -486,7 +486,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smax.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op) ret i32 %res } @@ -495,7 +495,7 @@ ; CHECK-LABEL: smaxv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a) ret i64 %res } @@ -506,7 +506,7 @@ ; CHECK-NEXT: smaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ret i64 %res } @@ -518,7 +518,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op) ret i64 %res } @@ -530,7 +530,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op) ret i64 %res } @@ -542,7 +542,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op) ret i64 %res } @@ -554,7 +554,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = 
call i64 @llvm.experimental.vector.reduce.smax.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op) ret i64 %res } @@ -567,7 +567,7 @@ ; CHECK-LABEL: sminv_v8i8: ; CHECK: sminv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ret i8 %res } @@ -576,7 +576,7 @@ ; CHECK-LABEL: sminv_v16i8: ; CHECK: sminv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ret i8 %res } @@ -588,7 +588,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op) ret i8 %res } @@ -600,7 +600,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op) ret i8 %res } @@ -612,7 +612,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op) ret i8 %res } @@ -624,7 +624,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.smin.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op) ret i8 %res } @@ -633,7 +633,7 @@ ; CHECK-LABEL: sminv_v4i16: ; CHECK: sminv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ret i16 %res } @@ -642,7 +642,7 @@ ; CHECK-LABEL: sminv_v8i16: ; CHECK: sminv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ret i16 %res } @@ -654,7 +654,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op) ret i16 %res } @@ -666,7 +666,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op) ret i16 %res } @@ -678,7 +678,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op) ret i16 %res } @@ -690,7 +690,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.smin.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op) ret i16 %res } @@ -699,7 +699,7 @@ ; CHECK-LABEL: sminv_v2i32: ; CHECK: minp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a) + %res = call i32 
@llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ret i32 %res } @@ -708,7 +708,7 @@ ; CHECK-LABEL: sminv_v4i32: ; CHECK: sminv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ret i32 %res } @@ -720,7 +720,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op) ret i32 %res } @@ -732,7 +732,7 @@ ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op) ret i32 %res } @@ -744,7 +744,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op) ret i32 %res } @@ -756,7 +756,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.smin.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op) ret i32 %res } @@ -765,7 +765,7 @@ ; CHECK-LABEL: sminv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a) ret i64 %res } @@ -776,7 +776,7 @@ ; CHECK-NEXT: sminv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) ret i64 %res } @@ -788,7 +788,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op) ret i64 %res } @@ -800,7 +800,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op) ret i64 %res } @@ -812,7 +812,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op) ret i64 %res } @@ -824,7 +824,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.smin.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op) ret i64 %res } @@ -837,7 +837,7 @@ ; CHECK-LABEL: umaxv_v8i8: ; CHECK: umaxv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) ret i8 %res } @@ -846,7 +846,7 @@ ; CHECK-LABEL: umaxv_v16i8: ; CHECK: umaxv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) ret i8 %res } @@ 
-858,7 +858,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op) ret i8 %res } @@ -870,7 +870,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op) ret i8 %res } @@ -882,7 +882,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op) ret i8 %res } @@ -894,7 +894,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umax.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op) ret i8 %res } @@ -903,7 +903,7 @@ ; CHECK-LABEL: umaxv_v4i16: ; CHECK: umaxv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) ret i16 %res } @@ -912,7 +912,7 @@ ; CHECK-LABEL: umaxv_v8i16: ; CHECK: umaxv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) ret i16 %res } @@ -924,7 +924,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op) ret i16 %res } @@ -936,7 +936,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op) ret i16 %res } @@ -948,7 +948,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op) ret i16 %res } @@ -960,7 +960,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umax.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op) ret i16 %res } @@ -969,7 +969,7 @@ ; CHECK-LABEL: umaxv_v2i32: ; CHECK: umaxp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) ret i32 %res } @@ -978,7 +978,7 @@ ; CHECK-LABEL: umaxv_v4i32: ; CHECK: umaxv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) ret i32 %res } @@ -990,7 +990,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op) ret i32 %res } @@ -1002,7 +1002,7 @@ ; VBITS_GE_512-NEXT: 
fmov w0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op) ret i32 %res } @@ -1014,7 +1014,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op) ret i32 %res } @@ -1026,7 +1026,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umax.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op) ret i32 %res } @@ -1035,7 +1035,7 @@ ; CHECK-LABEL: umaxv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a) ret i64 %res } @@ -1046,7 +1046,7 @@ ; CHECK-NEXT: umaxv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %res } @@ -1058,7 +1058,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op) ret i64 %res } @@ -1070,7 +1070,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op) ret i64 %res } @@ -1082,7 +1082,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op) ret i64 %res } @@ -1094,7 +1094,7 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umax.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op) ret i64 %res } @@ -1107,7 +1107,7 @@ ; CHECK-LABEL: uminv_v8i8: ; CHECK: uminv b0, v0.8b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a) + %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) ret i8 %res } @@ -1116,7 +1116,7 @@ ; CHECK-LABEL: uminv_v16i8: ; CHECK: uminv b0, v0.16b ; CHECK: ret - %res = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a) + %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) ret i8 %res } @@ -1128,7 +1128,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <32 x i8>, <32 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op) ret i8 %res } @@ -1140,7 +1140,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <64 x i8>, <64 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op) ret i8 %res } @@ -1152,7 +1152,7 @@ ; VBITS_GE_1024-NEXT: fmov 
w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <128 x i8>, <128 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op) ret i8 %res } @@ -1164,7 +1164,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <256 x i8>, <256 x i8>* %a - %res = call i8 @llvm.experimental.vector.reduce.umin.v256i8(<256 x i8> %op) + %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op) ret i8 %res } @@ -1173,7 +1173,7 @@ ; CHECK-LABEL: uminv_v4i16: ; CHECK: uminv h0, v0.4h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a) + %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) ret i16 %res } @@ -1182,7 +1182,7 @@ ; CHECK-LABEL: uminv_v8i16: ; CHECK: uminv h0, v0.8h ; CHECK: ret - %res = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a) + %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) ret i16 %res } @@ -1194,7 +1194,7 @@ ; VBITS_GE_256-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <16 x i16>, <16 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op) ret i16 %res } @@ -1206,7 +1206,7 @@ ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <32 x i16>, <32 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op) ret i16 %res } @@ -1218,7 +1218,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <64 x i16>, <64 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op) ret i16 %res } @@ -1230,7 +1230,7 @@ ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <128 x i16>, <128 x i16>* %a - %res = call i16 @llvm.experimental.vector.reduce.umin.v128i16(<128 x i16> %op) + %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op) ret i16 %res } @@ -1239,7 +1239,7 @@ ; CHECK-LABEL: uminv_v2i32: ; CHECK: minp v0.2s, v0.2s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a) + %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) ret i32 %res } @@ -1248,7 +1248,7 @@ ; CHECK-LABEL: uminv_v4i32: ; CHECK: uminv s0, v0.4s ; CHECK: ret - %res = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a) + %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) ret i32 %res } @@ -1260,7 +1260,7 @@ ; VBITS_GE_256-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <8 x i32>, <8 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op) ret i32 %res } @@ -1272,7 +1272,7 @@ ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <16 x i32>, <16 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op) ret i32 %res } @@ -1284,7 +1284,7 @@ ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <32 x i32>, <32 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op) ret i32 %res } @@ -1296,7 +1296,7 @@ ; VBITS_GE_2048-NEXT: fmov 
w0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <64 x i32>, <64 x i32>* %a - %res = call i32 @llvm.experimental.vector.reduce.umin.v64i32(<64 x i32> %op) + %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op) ret i32 %res } @@ -1305,7 +1305,7 @@ ; CHECK-LABEL: uminv_v1i64: ; CHECK: fmov x0, d0 ; CHECK: ret - %res = call i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64> %a) + %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a) ret i64 %res } @@ -1316,7 +1316,7 @@ ; CHECK-NEXT: uminv [[REDUCE:d[0-9]+]], [[PG]], z0.d ; CHECK-NEXT: fmov x0, [[REDUCE]] ; CHECK-NEXT: ret - %res = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a) + %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) ret i64 %res } @@ -1328,7 +1328,7 @@ ; VBITS_GE_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_256-NEXT: ret %op = load <4 x i64>, <4 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op) ret i64 %res } @@ -1340,7 +1340,7 @@ ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_512-NEXT: ret %op = load <8 x i64>, <8 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op) ret i64 %res } @@ -1352,7 +1352,7 @@ ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_1024-NEXT: ret %op = load <16 x i64>, <16 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op) ret i64 %res } @@ -1364,148 +1364,148 @@ ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]] ; VBITS_GE_2048-NEXT: ret %op = load <32 x i64>, <32 x i64>* %a - %res = call i64 @llvm.experimental.vector.reduce.umin.v32i64(<32 x i64> %op) + %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op) ret i64 %res } attributes #0 = { "target-features"="+sve" } -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v256i8(<256 x i8>) - -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v128i16(<128 x i16>) - -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v64i32(<64 x i32>) - -declare i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 
@llvm.experimental.vector.reduce.add.v32i64(<32 x i64>) - -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v256i8(<256 x i8>) - -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v128i16(<128 x i16>) - -declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v64i32(<64 x i32>) - -declare i64 @llvm.experimental.vector.reduce.smax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v32i64(<32 x i64>) - -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v256i8(<256 x i8>) - -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v128i16(<128 x i16>) - -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v64i32(<64 x i32>) - -declare i64 @llvm.experimental.vector.reduce.smin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v32i64(<32 x i64>) - -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 
@llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v256i8(<256 x i8>) - -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v128i16(<128 x i16>) - -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v64i32(<64 x i32>) - -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v32i64(<32 x i64>) - -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v256i8(<256 x i8>) - -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v128i16(<128 x i16>) - -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v64i32(<64 x i32>) - -declare i64 @llvm.experimental.vector.reduce.umin.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v32i64(<32 x i64>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 
@llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x 
i64>) +declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>) + +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>) + +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>) + +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>) + +declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.add.v1i1(<1 x i1> %a) -declare i8 @llvm.experimental.vector.reduce.add.v1i8(<1 x i8> %a) -declare i16 @llvm.experimental.vector.reduce.add.v1i16(<1 x i16> %a) -declare i24 @llvm.experimental.vector.reduce.add.v1i24(<1 x i24> %a) -declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a) -declare i64 
@llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a) -declare i128 @llvm.experimental.vector.reduce.add.v1i128(<1 x i128> %a) - -declare i8 @llvm.experimental.vector.reduce.add.v3i8(<3 x i8> %a) -declare i8 @llvm.experimental.vector.reduce.add.v9i8(<9 x i8> %a) -declare i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> %a) -declare i1 @llvm.experimental.vector.reduce.add.v4i1(<4 x i1> %a) -declare i24 @llvm.experimental.vector.reduce.add.v4i24(<4 x i24> %a) -declare i128 @llvm.experimental.vector.reduce.add.v2i128(<2 x i128> %a) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a) +declare i1 @llvm.vector.reduce.add.v1i1(<1 x i1> %a) +declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a) +declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a) +declare i24 @llvm.vector.reduce.add.v1i24(<1 x i24> %a) +declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) +declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a) +declare i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a) + +declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a) +declare i8 @llvm.vector.reduce.add.v9i8(<9 x i8> %a) +declare i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a) +declare i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %a) +declare i24 @llvm.vector.reduce.add.v4i24(<4 x i24> %a) +declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a) define i1 @test_v1i1(<1 x i1> %a) nounwind { ; CHECK-LABEL: test_v1i1: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.add.v1i1(<1 x i1> %a) + %b = call i1 @llvm.vector.reduce.add.v1i1(<1 x i1> %a) ret i1 %b } @@ -32,7 +32,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.add.v1i8(<1 x i8> %a) + %b = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a) ret i8 %b } @@ -42,7 +42,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret - %b = call i16 @llvm.experimental.vector.reduce.add.v1i16(<1 x i16> %a) + %b = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a) ret i16 %b } @@ -50,7 +50,7 @@ ; CHECK-LABEL: test_v1i24: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.add.v1i24(<1 x i24> %a) + %b = call i24 @llvm.vector.reduce.add.v1i24(<1 x i24> %a) ret i24 %b } @@ -60,7 +60,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a) + %b = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) ret i32 %b } @@ -70,7 +70,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> %a) + %b = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a) ret i64 %b } @@ -78,7 +78,7 @@ ; CHECK-LABEL: test_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.add.v1i128(<1 x i128> %a) + %b = call i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a) ret i128 %b } @@ -92,7 +92,7 @@ ; CHECK-NEXT: addv h0, v0.4h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.add.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a) ret i8 %b } @@ -109,7 +109,7 @@ ; CHECK-NEXT: addv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 
@llvm.experimental.vector.reduce.add.v9i8(<9 x i8> %a) + %b = call i8 @llvm.vector.reduce.add.v9i8(<9 x i8> %a) ret i8 %b } @@ -120,7 +120,7 @@ ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> %a) + %b = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %a) ret i32 %b } @@ -131,7 +131,7 @@ ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.add.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %a) ret i1 %b } @@ -141,7 +141,7 @@ ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.add.v4i24(<4 x i24> %a) + %b = call i24 @llvm.vector.reduce.add.v4i24(<4 x i24> %a) ret i24 %b } @@ -151,7 +151,7 @@ ; CHECK-NEXT: adds x0, x0, x2 ; CHECK-NEXT: adcs x1, x1, x3 ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.add.v2i128(<2 x i128> %a) + %b = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a) ret i128 %b } @@ -164,6 +164,6 @@ ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a) + %b = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -1,28 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a) -declare i8 @llvm.experimental.vector.reduce.and.v1i8(<1 x i8> %a) -declare i16 @llvm.experimental.vector.reduce.and.v1i16(<1 x i16> %a) -declare i24 @llvm.experimental.vector.reduce.and.v1i24(<1 x i24> %a) -declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a) -declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %a) -declare i128 @llvm.experimental.vector.reduce.and.v1i128(<1 x i128> %a) - -declare i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) -declare i8 @llvm.experimental.vector.reduce.and.v9i8(<9 x i8> %a) -declare i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> %a) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) -declare i24 @llvm.experimental.vector.reduce.and.v4i24(<4 x i24> %a) -declare i128 @llvm.experimental.vector.reduce.and.v2i128(<2 x i128> %a) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a) +declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a) +declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %a) +declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %a) +declare i24 @llvm.vector.reduce.and.v1i24(<1 x i24> %a) +declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) +declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a) +declare i128 @llvm.vector.reduce.and.v1i128(<1 x i128> %a) + +declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a) +declare i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) +declare i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) +declare i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a) +declare i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) define i1 
@test_v1i1(<1 x i1> %a) nounwind { ; CHECK-LABEL: test_v1i1: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a) ret i1 %b } @@ -32,7 +32,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.and.v1i8(<1 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %a) ret i8 %b } @@ -42,7 +42,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret - %b = call i16 @llvm.experimental.vector.reduce.and.v1i16(<1 x i16> %a) + %b = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %a) ret i16 %b } @@ -50,7 +50,7 @@ ; CHECK-LABEL: test_v1i24: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.and.v1i24(<1 x i24> %a) + %b = call i24 @llvm.vector.reduce.and.v1i24(<1 x i24> %a) ret i24 %b } @@ -60,7 +60,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a) + %b = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) ret i32 %b } @@ -70,7 +70,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %a) + %b = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a) ret i64 %b } @@ -78,7 +78,7 @@ ; CHECK-LABEL: test_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.and.v1i128(<1 x i128> %a) + %b = call i128 @llvm.vector.reduce.and.v1i128(<1 x i128> %a) ret i128 %b } @@ -89,7 +89,7 @@ ; CHECK-NEXT: and w8, w8, w2 ; CHECK-NEXT: and w0, w8, #0xff ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a) ret i8 %b } @@ -120,7 +120,7 @@ ; CHECK-NEXT: umov w9, v0.b[7] ; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.and.v9i8(<9 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) ret i8 %b } @@ -133,7 +133,7 @@ ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> %a) + %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) ret i32 %b } @@ -150,7 +150,7 @@ ; CHECK-NEXT: and w8, w9, w8 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -163,7 +163,7 @@ ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.and.v4i24(<4 x i24> %a) + %b = call i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a) ret i24 %b } @@ -173,7 +173,7 @@ ; CHECK-NEXT: and x0, x0, x2 ; CHECK-NEXT: and x1, x1, x3 ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.and.v2i128(<2 x i128> %a) + %b = call i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a) ret i128 %b } @@ -189,6 +189,6 @@ ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a) + %b = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll --- 
a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) +declare i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %a) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) +declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) -declare i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) +declare i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %a) +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) +declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1: @@ -24,7 +24,7 @@ ; CHECK-NEXT: csel w0, w0, w1, lt ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -41,7 +41,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <2 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -58,7 +58,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <4 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -73,7 +73,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <8 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -88,7 +88,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <16 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -105,7 +105,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <32 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %x) + %y = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %x) %z = 
select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -119,7 +119,7 @@ ; CHECK-NEXT: csel w0, w0, w1, lt ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -136,7 +136,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <2 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -153,7 +153,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <4 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -168,7 +168,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <8 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -183,7 +183,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <16 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } @@ -200,7 +200,7 @@ ; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <32 x i8> %a0, zeroinitializer - %y = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %x) + %y = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %x) %z = select i1 %y, i32 %a1, i32 %a2 ret i32 %z } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll @@ -3,14 +3,14 @@ ; Same as vecreduce-fadd-legalization.ll, but without fmf. 
-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -20,7 +20,7 @@ ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret - %b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -31,7 +31,7 @@ ; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -41,7 +41,7 @@ ; CHECK-NEXT: fmov d1, xzr ; CHECK-NEXT: fadd d0, d0, d1 ; CHECK-NEXT: ret - %b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -54,7 +54,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -68,7 +68,7 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: fadd s0, s1, s0 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -86,7 +86,7 @@ ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 // =32 ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -123,6 +123,6 @@ ; CHECK-NEXT: mov s1, v3.s[3] ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll 
@@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a) + %b = call fast nnan half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a) + %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a) + %b = call fast nnan double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fast nnan fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -53,7 +53,7 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a) + %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fast nnan fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast nnan fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -78,6 +78,6 @@ ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret - %b = call fast nnan float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a) + %b = call fast nnan float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x 
float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -14,7 +14,7 @@ ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECKNOFP16-NEXT: faddp s0, v0.2s ; CHECKNOFP16-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) + %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %bin.rdx) ret float %r } @@ -48,7 +48,7 @@ ; CHECKNOFP16-NEXT: fadd s0, s0, s1 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret - %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) + %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half 0.0, <4 x half> %bin.rdx) ret half %r } @@ -103,7 +103,7 @@ ; CHECKNOFP16-NEXT: fadd s0, s0, s1 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret - %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) + %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half 0.0, <8 x half> %bin.rdx) ret half %r } @@ -121,7 +121,7 @@ ; CHECKNOFP16-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECKNOFP16-NEXT: faddp s0, v0.2s ; CHECKNOFP16-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) + %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %bin.rdx) ret float %r } @@ -135,7 +135,7 @@ ; CHECKNOFP16: // %bb.0: ; CHECKNOFP16-NEXT: faddp d0, v0.2d ; CHECKNOFP16-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) + %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %bin.rdx) ret double %r } @@ -229,7 +229,7 @@ ; CHECKNOFP16-NEXT: fadd s0, s1, s0 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret - %r = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) + %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half 0.0, <16 x half> %bin.rdx) ret half %r } @@ -249,7 +249,7 @@ ; CHECKNOFP16-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECKNOFP16-NEXT: faddp s0, v0.2s ; CHECKNOFP16-NEXT: ret - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) + %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %bin.rdx) ret float %r } @@ -265,16 +265,16 @@ ; CHECKNOFP16-NEXT: fadd v0.2d, v0.2d, v1.2d ; CHECKNOFP16-NEXT: faddp d0, v0.2d ; CHECKNOFP16-NEXT: ret - %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) + %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %bin.rdx) ret double %r } ; Function Attrs: nounwind readnone -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double 
@llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>) +declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) +declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) +declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) +declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) +declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) + %b = call half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) + %b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) + %b = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %b } @@ -40,14 +40,14 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) ret fp128 %b } ; TODO: This doesn't work, because ExpandReductions only supports power of two ; unordered reductions. 
;define float @test_v3f32(<3 x float> %a) nounwind { -; %b = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) +; %b = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ; ret float %b ;} @@ -55,7 +55,7 @@ ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: ; CHECK-NEXT: b fmaxl - %b = call fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -67,6 +67,6 @@ ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) + %b = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) +declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) +declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) -declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) +declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) +declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan half @llvm.experimental.vector.reduce.fmax.v1f16(<1 x half> %a) + %b = call nnan half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %a) + %b = call nnan double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.v1f128(<1 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a) ret fp128 %b } @@ -52,7 +52,7 @@ ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: mov 
v0.s[3], v1.s[0] ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan ninf float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a) + %b = call nnan ninf float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a) ret float %b } @@ -72,7 +72,7 @@ ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: ; CHECK-NEXT: b fmaxl - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -84,6 +84,6 @@ ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: fmaxnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a) -declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a) -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) -declare fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a) +declare half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a) +declare float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a) +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) +declare fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) -declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a) +declare float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) +declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan half @llvm.experimental.vector.reduce.fmin.v1f16(<1 x half> %a) + %b = call nnan half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a) ret half %b } @@ -24,7 +24,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a) ret float %b } @@ -32,7 +32,7 @@ ; CHECK-LABEL: test_v1f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %a) + %b = call nnan double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a) ret double %b } @@ -40,7 +40,7 @@ ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v1f128(<1 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a) ret fp128 %b } @@ -52,7 +52,7 @@ ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fminnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) ret 
float %b } @@ -64,7 +64,7 @@ ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: fminnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan ninf float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a) + %b = call nnan ninf float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a) ret float %b } @@ -72,7 +72,7 @@ ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: ; CHECK-NEXT: b fminl - %b = call nnan fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) + %b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } @@ -84,6 +84,6 @@ ; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s ; CHECK-NEXT: fminnmv s0, v0.4s ; CHECK-NEXT: ret - %b = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a) + %b = call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll @@ -3,14 +3,14 @@ ; Same as vecreduce-fmul-legalization.ll, but without fmf. -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fmul.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fmul.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -20,7 +20,7 @@ ; CHECK-NEXT: fmul s0, s0, s1 ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret - %b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -31,7 +31,7 @@ ; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: fmul s0, s1, v0.s[0] ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -41,7 +41,7 @@ ; CHECK-NEXT: fmov d1, xzr ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: ret - %b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fmul.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -54,7 +54,7 @@ ; CHECK-NEXT: bl __multf3 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret 
fp128 %b } @@ -66,7 +66,7 @@ ; CHECK-NEXT: fmul s1, s1, v0.s[1] ; CHECK-NEXT: fmul s0, s1, v0.s[2] ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -84,7 +84,7 @@ ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 // =32 ; CHECK-NEXT: ret - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -109,6 +109,6 @@ ; CHECK-NEXT: fmul s0, s0, v3.s[2] ; CHECK-NEXT: fmul s0, s0, v3.s[3] ; CHECK-NEXT: ret - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll @@ -24,8 +24,8 @@ %1 = insertelement <4 x double> %0, double 1.0, i32 1 %2 = insertelement <4 x double> %1, double 1.0, i32 2 %3 = insertelement <4 x double> %2, double 1.0, i32 3 - %4 = call nnan reassoc double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %3) + %4 = call nnan reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> %3) ret double %4 } -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare i1 @llvm.experimental.vector.reduce.umax.v1i1(<1 x i1> %a) -declare i8 @llvm.experimental.vector.reduce.umax.v1i8(<1 x i8> %a) -declare i16 @llvm.experimental.vector.reduce.umax.v1i16(<1 x i16> %a) -declare i24 @llvm.experimental.vector.reduce.umax.v1i24(<1 x i24> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a) -declare i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a) -declare i128 @llvm.experimental.vector.reduce.umax.v1i128(<1 x i128> %a) - -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a) -declare i8 @llvm.experimental.vector.reduce.umax.v3i8(<3 x i8> %a) -declare i8 @llvm.experimental.vector.reduce.umax.v9i8(<9 x i8> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v3i32(<3 x i32> %a) -declare i1 @llvm.experimental.vector.reduce.umax.v4i1(<4 x i1> %a) -declare i24 @llvm.experimental.vector.reduce.umax.v4i24(<4 x i24> %a) -declare i128 @llvm.experimental.vector.reduce.umax.v2i128(<2 x i128> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a) +declare i1 @llvm.vector.reduce.umax.v1i1(<1 x i1> %a) +declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %a) +declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %a) +declare i24 @llvm.vector.reduce.umax.v1i24(<1 x i24> %a) +declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) +declare i64 
@llvm.vector.reduce.umax.v1i64(<1 x i64> %a) +declare i128 @llvm.vector.reduce.umax.v1i128(<1 x i128> %a) + +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) +declare i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %a) +declare i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a) +declare i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) +declare i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a) +declare i24 @llvm.vector.reduce.umax.v4i24(<4 x i24> %a) +declare i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a) define i1 @test_v1i1(<1 x i1> %a) nounwind { ; CHECK-LABEL: test_v1i1: ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.umax.v1i1(<1 x i1> %a) + %b = call i1 @llvm.vector.reduce.umax.v1i1(<1 x i1> %a) ret i1 %b } @@ -33,7 +33,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.umax.v1i8(<1 x i8> %a) + %b = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %a) ret i8 %b } @@ -43,7 +43,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret - %b = call i16 @llvm.experimental.vector.reduce.umax.v1i16(<1 x i16> %a) + %b = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %a) ret i16 %b } @@ -51,7 +51,7 @@ ; CHECK-LABEL: test_v1i24: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.umax.v1i24(<1 x i24> %a) + %b = call i24 @llvm.vector.reduce.umax.v1i24(<1 x i24> %a) ret i24 %b } @@ -61,7 +61,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a) + %b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) ret i32 %b } @@ -71,7 +71,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.umax.v1i64(<1 x i64> %a) + %b = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a) ret i64 %b } @@ -79,7 +79,7 @@ ; CHECK-LABEL: test_v1i128: ; CHECK: // %bb.0: ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.umax.v1i128(<1 x i128> %a) + %b = call i128 @llvm.vector.reduce.umax.v1i128(<1 x i128> %a) ret i128 %b } @@ -92,7 +92,7 @@ ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: csel x0, x9, x8, hi ; CHECK-NEXT: ret - %b = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a) + %b = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %b } @@ -107,7 +107,7 @@ ; CHECK-NEXT: umaxv h0, v0.4h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.umax.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %a) ret i8 %b } @@ -124,7 +124,7 @@ ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i8 @llvm.experimental.vector.reduce.umax.v9i8(<9 x i8> %a) + %b = call i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a) ret i8 %b } @@ -135,7 +135,7 @@ ; CHECK-NEXT: umaxv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.umax.v3i32(<3 x i32> %a) + %b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) ret i32 %b } @@ -148,7 +148,7 @@ ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret - %b = call i1 @llvm.experimental.vector.reduce.umax.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a) ret i1 %b } @@ 
-159,7 +159,7 @@ ; CHECK-NEXT: umaxv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i24 @llvm.experimental.vector.reduce.umax.v4i24(<4 x i24> %a) + %b = call i24 @llvm.vector.reduce.umax.v4i24(<4 x i24> %a) ret i24 %b } @@ -173,7 +173,7 @@ ; CHECK-NEXT: csel x0, x8, x9, eq ; CHECK-NEXT: csel x1, x1, x3, hi ; CHECK-NEXT: ret - %b = call i128 @llvm.experimental.vector.reduce.umax.v2i128(<2 x i128> %a) + %b = call i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a) ret i128 %b } @@ -186,6 +186,6 @@ ; CHECK-NEXT: umaxv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret - %b = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a) + %b = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>) +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half 0.0, <4 x half> %a) + %b = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half 0.0, <4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl __aeabi_fadd ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a) + %b = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl __aeabi_dadd ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double zeroinitializer, <2 x double> %a) + %b = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double zeroinitializer, <2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by 
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon | FileCheck %s --check-prefix=CHECK

-declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half, <1 x half>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float, <1 x float>)
-declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128, <1 x fp128>)
+declare half @llvm.vector.reduce.fadd.f16.v1f16(half, <1 x half>)
+declare float @llvm.vector.reduce.fadd.f32.v1f32(float, <1 x float>)
+declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>)

-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float, <3 x float>)
-declare fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128, <2 x fp128>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.f32.v3f32(float, <3 x float>)
+declare fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>)
+declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

define half @test_v1f16(<1 x half> %a) nounwind {
; CHECK-LABEL: test_v1f16:
@@ -28,7 +28,7 @@
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
- %b = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v1f16(half 0.0, <1 x half> %a)
+ %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a)
 ret half %b
}

@@ -44,7 +44,7 @@
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
- %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v1f32(float 0.0, <1 x float> %a)
+ %b = call float @llvm.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a)
 ret float %b
}

@@ -56,7 +56,7 @@
; CHECK-NEXT: vadd.f64 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
- %b = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double 0.0, <1 x double> %a)
+ %b = call double @llvm.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a)
 ret double %b
}

@@ -76,7 +76,7 @@
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: pop {r11, lr}
; CHECK-NEXT: mov pc, lr
- %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
+ %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a)
 ret fp128 %b
}

@@ -95,7 +95,7 @@
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
- %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v3f32(float 0.0, <3 x float> %a)
+ %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a)
 ret float %b
}

@@ -124,7 +124,7 @@
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: pop {r4, r5, r11, lr}
; CHECK-NEXT: mov pc, lr
- %b = call fp128 @llvm.experimental.vector.reduce.v2.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
+ %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a)
 ret fp128 %b
}

@@ -162,6 +162,6 @@
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI6_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
- %b = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a)
+ %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a)
 ret float %b
}
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128>) +declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %a) + %b = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl fmaxf ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a) + %b = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl fmax ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a) + %b = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.fmax.v2f128(<2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128>) +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %a) + %b = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl fminf ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float 
@llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a) + %b = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl fmin ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a) + %b = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.fmin.v2f128(<2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=-neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) +declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>) define half @test_v4f16(<4 x half> %a) nounwind { ; CHECK-LABEL: test_v4f16: @@ -37,7 +37,7 @@ ; CHECK-NEXT: bl __aeabi_f2h ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half 1.0, <4 x half> %a) + %b = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %a) ret half %b } @@ -55,7 +55,7 @@ ; CHECK-NEXT: bl __aeabi_fmul ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a) + %b = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) ret float %b } @@ -67,7 +67,7 @@ ; CHECK-NEXT: bl __aeabi_dmul ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a) + %b = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a) ret double %b } @@ -90,6 +90,6 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fast fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) + %b = call fast fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s 
-mtriple=arm-none-eabi -mattr=+neon | FileCheck %s --check-prefix=CHECK -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half, <1 x half>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float, <1 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128, <1 x fp128>) +declare half @llvm.vector.reduce.fmul.f16.v1f16(half, <1 x half>) +declare float @llvm.vector.reduce.fmul.f32.v1f32(float, <1 x float>) +declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128, <1 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float, <3 x float>) -declare fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128, <2 x fp128>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) define half @test_v1f16(<1 x half> %a) nounwind { ; CHECK-LABEL: test_v1f16: @@ -28,7 +28,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v1f16(half 0.0, <1 x half> %a) + %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 0.0, <1 x half> %a) ret half %b } @@ -44,7 +44,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI1_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v1f32(float 0.0, <1 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v1f32(float 0.0, <1 x float> %a) ret float %b } @@ -56,7 +56,7 @@ ; CHECK-NEXT: vmul.f64 d16, d17, d16 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr - %b = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double 0.0, <1 x double> %a) + %b = call double @llvm.vector.reduce.fmul.f64.v1f64(double 0.0, <1 x double> %a) ret double %b } @@ -76,7 +76,7 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) ret fp128 %b } @@ -95,7 +95,7 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI4_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v3f32(float 0.0, <3 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v3f32(float 0.0, <3 x float> %a) ret float %b } @@ -124,7 +124,7 @@ ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr - %b = call fp128 @llvm.experimental.vector.reduce.v2.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) ret fp128 %b } @@ -162,6 +162,6 @@ ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI6_0: ; CHECK-NEXT: .long 0x00000000 @ float 0 - %b = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 0.0, <16 x float> %a) + %b = call float @llvm.vector.reduce.fmul.f32.v16f32(float 0.0, <16 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll --- 
a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll +++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -1,24 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -expand-reductions -S | FileCheck %s ; Tests without a target which should expand all reductions -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) -declare i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8>) +declare i8 @llvm.vector.reduce.and.i8.v3i8(<3 x i8>) define i64 @add_i64(<2 x i64> %vec) { ; CHECK-LABEL: @add_i64( @@ -29,7 +29,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %vec) ret i64 %r } @@ -42,7 +42,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %vec) ret i64 %r } @@ -55,7 +55,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %vec) ret i64 %r } @@ -68,7 +68,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %vec) ret i64 %r } @@ -81,7 +81,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %vec) ret i64 %r } @@ -97,7 +97,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %vec) ret float %r } @@ -113,7 +113,7 @@ ; CHECK-NEXT: ret float 
[[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -131,7 +131,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -149,7 +149,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -165,7 +165,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %vec) ret float %r } @@ -181,7 +181,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -199,7 +199,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) ret float %r } @@ -217,7 +217,7 @@ ; CHECK-NEXT: ret float [[BIN_RDX3]] ; entry: - %r = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %accum, <4 x float> %vec) + %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec) ret float %r } @@ -231,7 +231,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %vec) ret i64 %r } @@ -245,7 +245,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %vec) ret i64 %r } @@ -259,7 +259,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %vec) ret i64 %r } @@ -273,7 +273,7 @@ ; CHECK-NEXT: ret i64 [[TMP0]] ; entry: - %r = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %vec) + %r = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %vec) ret i64 %r } @@ -282,11 +282,11 @@ define double @fmax_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmax_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> [[VEC:%.*]]) ; CHECK-NEXT: ret double [[R]] ; entry: - %r = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %vec) + %r = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %vec) ret double %r } @@ -295,11 +295,11 @@ define double @fmin_f64(<2 x double> %vec) { ; CHECK-LABEL: @fmin_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) +; CHECK-NEXT: [[R:%.*]] = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> [[VEC:%.*]]) ; CHECK-NEXT: 
ret double [[R]] ; entry: - %r = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %vec) + %r = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %vec) ret double %r } @@ -309,10 +309,10 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: @test_v3i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.experimental.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) +; CHECK-NEXT: [[B:%.*]] = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> [[A:%.*]]) ; CHECK-NEXT: ret i8 [[B]] ; entry: - %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) + %b = call i8 @llvm.vector.reduce.and.i8.v3i8(<3 x i8> %a) ret i8 %b } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/add_reduce.mir @@ -44,7 +44,7 @@ %add7 = add <4 x i32> %mul, %splat.output %max = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %add7, <4 x i32> %.splat.i42, i32 1, <4 x i1> %pred, <4 x i32> undef) %min = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %max, <4 x i32> %.splat.i, i32 1, <4 x i1> %pred, <4 x i32> undef) - %reduce = tail call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %min) + %reduce = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %min) store i32 %reduce, i32* %scevgep2 %add.ptr = getelementptr inbounds i8, i8* %input_1_vect.addr.052, i32 4 %add.ptr14 = getelementptr inbounds i8, i8* %input_2_vect.addr.051, i32 4 @@ -62,7 +62,7 @@ declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 declare i1 @llvm.test.set.loop.iterations.i32(i32) #4 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #4 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #5 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #5 ... 
--- diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -85,7 +85,7 @@ middle.block: ; preds = %vector.body %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) + %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -188,7 +188,7 @@ middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -287,7 +287,7 @@ middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -386,7 +386,7 @@ middle.block: ; preds = %vector.body %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -528,6 +528,6 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) ; Function Attrs: nounwind readnone willreturn -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-1.mir @@ -56,7 +56,7 @@ br i1 %tmp16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14) + %tmp17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -64,7 +64,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inloop-vpsel-2.mir @@ -58,7 +58,7 @@ br i1 %tmp16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp14) + %tmp17 
= call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -66,7 +66,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/invariant-qreg.mir @@ -68,7 +68,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %tmp12 = mul nsw <4 x i32> %pass, %tmp10 - %tmp13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp12) + %tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12) %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -105,7 +105,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %tmp12 = add nsw <4 x i32> %pass, %tmp10 - %tmp13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp12) + %tmp13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp12) %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 @@ -117,7 +117,7 @@ ret i32 %res } - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir @@ -40,7 +40,7 @@ br i1 %15, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %16 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) + %16 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -88,7 +88,7 @@ br i1 %15, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %16 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) + %16 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -98,7 +98,7 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x 
float>*, i32 immarg, <4 x i1>, <4 x float>) - declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) + declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/matrix.mir @@ -91,7 +91,7 @@ %22 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %10) %23 = bitcast i16* %lsr.iv7 to i1* %24 = select <4 x i1> %22, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa - %25 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %24) + %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24) %sunkaddr = mul i32 %i.064.us, 4 %26 = bitcast i32* %e to i8* %sunkaddr17 = getelementptr inbounds i8, i8* %26, i32 %sunkaddr @@ -141,7 +141,7 @@ } declare dso_local arm_aapcs_vfpcc signext i16 @crc16(...) local_unnamed_addr #0 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -69,7 +69,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -145,7 +145,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -221,7 +221,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -297,7 +297,7 @@ middle.block: ; preds = %vector.body %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi - %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -371,7 +371,7 @@ middle.block: ; preds = %vector.body %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi - %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6) + %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -1273,6 +1273,6 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare void 
@llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -51,7 +51,7 @@ ; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) ; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4 ; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1 ; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]] @@ -112,7 +112,7 @@ middle.block: ; preds = %vector.body %tmp17 = select <4 x i1> %tmp7, <4 x i32> %tmp14, <4 x i32> %vec.phi - %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17) + %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17) store i32 %tmp18, i32* %arrayidx8.us, align 4 %inc10.us = add nuw i32 %i.025.us, 1 %exitcond27 = icmp eq i32 %inc10.us, %N @@ -170,7 +170,7 @@ ; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) ; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1 ; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]] @@ -229,7 +229,7 @@ middle.block: ; preds = %vector.body %tmp15 = select <4 x i1> %tmp7, <4 x i32> %tmp12, <4 x i32> %vec.phi - %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp15) + %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp15) store i32 %tmp16, i32* %arrayidx7.us, align 4 %inc9.us = add nuw i32 %i.024.us, 1 %exitcond26 = icmp eq i32 %inc9.us, %N @@ -247,7 +247,7 @@ declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #0 ; Function Attrs: nounwind readnone willreturn -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #1 +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1 ; Function Attrs: noduplicate nounwind declare void @llvm.set.loop.iterations.i32(i32) #2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -40,7 +40,7 @@ br i1 %tmp15, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %tmp16 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp13) + %tmp16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x 
i32> %tmp13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -48,7 +48,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/non-masked-load.mir @@ -44,7 +44,7 @@ %.lcssa = phi <16 x i8> [ %13, %vector.body ] %16 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %7) %17 = select <16 x i1> %16, <16 x i8> %.lcssa, <16 x i8> %vec.phi.lcssa - %18 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %17) + %18 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %17) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -53,7 +53,7 @@ } declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #1 - declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #2 + declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <16 x i1> @llvm.arm.mve.vctp8(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout.mir @@ -36,7 +36,7 @@ br i1 %cmp, label %for.body, label %middle.block middle.block: ; preds = %for.body - %reduce = tail call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %acc.next) + %reduce = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %acc.next) ret i16 %reduce for.cond.cleanup: ; preds = %entry @@ -47,7 +47,7 @@ declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) #2 declare i1 @llvm.test.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #4 + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #4 declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir @@ -41,7 +41,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -88,7 +88,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -135,7 +135,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -182,7 +182,7 @@ br i1 %16, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -228,7 +228,7 @@ br i1 %14, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -274,7 +274,7 @@ br i1 %14, label %vector.body, label %middle.block middle.block: ; preds = %vector.body - %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -285,7 +285,7 @@ declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -45,7 +45,7 @@ %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16 %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi - %i6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i5) + %i6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i5) %index.next = add i32 %index, 16 %i7 = icmp eq i32 %index.next, %n.vec br i1 %i7, 
label %middle.block, label %vector.body @@ -123,7 +123,7 @@ middle.block: ; preds = %vector.body %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi - %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9) + %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -193,7 +193,7 @@ middle.block: ; preds = %vector.body %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) + %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -265,7 +265,7 @@ middle.block: ; preds = %vector.body %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi - %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9) + %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -335,7 +335,7 @@ middle.block: ; preds = %vector.body %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) + %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -407,7 +407,7 @@ middle.block: ; preds = %vector.body %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi - %i10 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i9) + %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -504,7 +504,7 @@ middle.block: ; preds = %vector.body %i9 = select <4 x i1> %active.lane.mask, <4 x i32> %i7, <4 x i32> %vec.phi - %i10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i9) + %i10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i9) br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph47 vector.ph47: ; preds = %middle.block @@ -534,7 +534,7 @@ middle.block44: ; preds = %vector.body46 %i21 = select <4 x i1> %active.lane.mask61, <4 x i32> %i19, <4 x i32> %vec.phi60 - %i22 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i21) + %i22 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i21) br label %for.cond.cleanup7 for.cond.cleanup7: ; preds = %middle.block44, %middle.block, %entry @@ -620,9 +620,9 @@ middle.block: ; preds = %vector.body %i11 = select <8 x i1> %active.lane.mask, <8 x i16> %i8, <8 x i16> %vec.phi - %i12 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i11) + %i12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i11) %i13 = select <8 x i1> %active.lane.mask, <8 x i16> %i9, <8 x i16> %vec.phi.1 - %i14 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %i13) + %i14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i13) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -747,7 +747,7 @@ middle.block: ; preds = %vector.body %10 = select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi - %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %10) + %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10) br label %for.end for.end: ; preds = %middle.block, %lor.end @@ -758,10 +758,10 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) declare <16 x i1> 
@llvm.get.active.lane.mask.v16i1.i32(i32, i32) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/skip-debug.mir @@ -46,7 +46,7 @@ %.lcssa = phi <4 x i32> [ %15, %vector.body ], !dbg !38 %18 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %9), !dbg !34 %19 = select <4 x i1> %18, <4 x i32> %.lcssa, <4 x i32> %vec.phi.lcssa, !dbg !38 - %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19), !dbg !32 + %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19), !dbg !32 br label %for.cond.cleanup, !dbg !42 for.cond.cleanup: ; preds = %middle.block, %entry @@ -58,7 +58,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -258,7 +258,7 @@ middle.block: ; preds = %vector.body %19 = select <4 x i1> %active.lane.mask, <4 x i32> %16, <4 x i32> %vec.phi - %20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %19) + %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19) br label %for.end for.end: ; preds = %middle.block, %for.body @@ -282,6 +282,6 @@ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll @@ -74,14 +74,14 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7) - %10 = call i32 
@llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5) + %9 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %7) + %10 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %5) store i32 %10, i32* %minp, align 4 ret i32 %9 } declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3 -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3 +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #3 +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredicated-max.mir @@ -26,7 +26,7 @@ %tmp8 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %tmp7) %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) - %min = tail call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %wide.masked.load) + %min = tail call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %wide.masked.load) store i16 %min, i16* %lsr.iv.2 %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 %scevgep.2 = getelementptr i16, i16* %lsr.iv.2, i32 1 @@ -43,7 +43,7 @@ declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) - declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) + declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) ... --- diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vaddv.mir @@ -26,7 +26,7 @@ %tmp9 = sub i32 %tmp7, 4 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp10) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp10) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 @@ -64,7 +64,7 @@ %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) %sext = sext <8 x i16> %wide.masked.load to <8 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %sext) + %tmp11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sext) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 @@ -102,7 +102,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %sext = sext <16 x i8> %wide.masked.load to <16 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %sext) + %tmp11 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %sext) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i8, i8* %lsr.iv, 
i32 16 @@ -140,7 +140,7 @@ %tmp9 = sub i32 %tmp7, 4 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp10) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp10) %acc.next = add i32 %tmp11, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) @@ -179,7 +179,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 @@ -218,7 +218,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) %acc.next = add i32 %tmp11, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) @@ -257,7 +257,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = zext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) store i32 %tmp11, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 @@ -296,7 +296,7 @@ %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef) %tmp10 = zext <4 x i16> %wide.masked.load to <4 x i32> %not = xor <4 x i32> %tmp10, - %tmp11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %not) + %tmp11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %not) %acc.next = add i32 %tmp11, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1) @@ -335,7 +335,7 @@ %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv17, i32 1, <8 x i1> %tmp8, <8 x i8> undef) %sext.wide = sext <8 x i8> %wide.masked.load to <8 x i16> %sub = sub <8 x i16> %sext.wide, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %sext.reduce = sext i16 %reduce to i32 store i32 %sext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -375,7 +375,7 @@ %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %lsr.iv17, i32 1, <8 x i1> %tmp8, <8 x i8> undef) %sext.wide = sext <8 x i8> %wide.masked.load to <8 x i16> %sub = sub <8 x i16> %sext.wide, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %sext.reduce = sext i16 %reduce to i32 %acc.next = add i32 %sext.reduce, %acc %scevgep = getelementptr i8, i8* %lsr.iv, i32 8 @@ -414,7 +414,7 @@ %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) %sub = sub <8 x i16> %wide.masked.load, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %zext.reduce = zext i16 %reduce to i32 store i32 %zext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -453,7 +453,7 @@ %tmp9 = sub i32 %tmp7, 8 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv17, i32 2, <8 x i1> %tmp8, <8 x i16> undef) %sub = sub <8 x i16> %wide.masked.load, %pass - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %sub) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sub) %zext.reduce = zext i16 %reduce to i32 %acc.next = add i32 %zext.reduce, %acc %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 @@ -492,7 +492,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %sext.reduce = sext i8 %reduce to i32 store i32 %sext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -531,7 +531,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %sext.reduce = sext i8 %reduce to i32 %acc.next = add i32 %sext.reduce, %acc %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 @@ -570,7 +570,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %zext.reduce = zext i8 %reduce to i32 store i32 %zext.reduce, i32* %store.addr %store.next = getelementptr i32, i32* %store.addr, i32 1 @@ -609,7 +609,7 @@ %tmp9 = sub i32 %tmp7, 16 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %lsr.iv17, i32 1, <16 x i1> %tmp8, <16 x i8> undef) %xor = xor <16 x i8> %wide.masked.load, %pass - %reduce = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %xor) + %reduce = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %xor) %zext.reduce = zext i8 %reduce to i32 %acc.next = add i32 %zext.reduce, %acc %scevgep = getelementptr i8, i8* %lsr.iv, i32 16 @@ -652,7 +652,7 @@ %tmp4 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp3, i32 2, <4 x i1> %tmp, <4 x i16> zeroinitializer) %zext.wide.2 = zext <4 x i16> %tmp4 to <4 x i32> %or = or <4 x i32> %zext.wide.1, %zext.wide.2 - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %or) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %or) 
%acc.next = add i32 %reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 4 %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 4 @@ -693,7 +693,7 @@ %tmp2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp1, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %tmp4 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp3, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %or = or <8 x i16> %tmp2, %tmp4 - %reduce = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %or) + %reduce = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %or) %zext.reduce = zext i16 %reduce to i32 %acc.next = add i32 %zext.reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8 @@ -737,7 +737,7 @@ %tmp5 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 1) %tmp6 = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 0) %mul = add <4 x i32> %tmp5, %tmp6 - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %mul) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul) %acc.next = add i32 %reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8 %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 8 @@ -778,7 +778,7 @@ %tmp2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp1, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %tmp4 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp3, i32 2, <8 x i1> %tmp, <8 x i16> zeroinitializer) %mul = tail call <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp4, i32 0, i32 1) - %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %mul) + %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul) %acc.next = add i32 %reduce, %acc %add.ptr = getelementptr inbounds i16, i16* %x.addr.026, i32 8 %add.ptr4 = getelementptr inbounds i16, i16* %y.addr.025, i32 8 @@ -798,11 +798,11 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) - declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) - declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) - declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) + declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) + declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i32> @llvm.arm.mve.vmull.v4i32.v8i16(<8 x i16>, <8 x i16>, i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -214,7 +214,7 @@ middle.block: ; preds = %vector.body %i19 = select <4 x i1> %active.lane.mask, <4 x i32> 
%i16, <4 x i32> %vec.phi - %i20 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %i19) + %i20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i19) br label %for.end for.end: ; preds = %middle.block, %for.body @@ -235,6 +235,6 @@ declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare void @llvm.set.loop.iterations.i32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -47,7 +47,7 @@ %15 = add i32 %8, 4 %16 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %15) %17 = select <4 x i1> %16, <4 x i32> %12, <4 x i32> %vec.phi - %18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %17) + %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %17) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -55,7 +55,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp16-reduce.mir @@ -46,7 +46,7 @@ %.lcssa = phi <8 x i16> [ %15, %vector.body ] %18 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) %19 = select <8 x i1> %18, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa - %20 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %19) + %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -54,7 +54,7 @@ ret i16 %a.0.lcssa } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -70,7 +70,7 @@ middle.block: ; preds = %vector.body %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi - %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8) + %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -141,7 +141,7 @@ middle.block: ; preds = %vector.body %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi - %6 = call i32 
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -212,7 +212,7 @@ middle.block: ; preds = %vector.body %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi - %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -459,7 +459,7 @@ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -16,7 +16,7 @@ ; CHECK: middle.block: ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]], -; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) +; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { entry: @@ -64,7 +64,7 @@ middle.block: ; preds = %vector.body %12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi - %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %12) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -73,7 +73,7 @@ } declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -118,7 +118,7 @@ middle.block: ; preds = %vector.body %8 = call <4 x i1> @llvm.arm.vctp32(i32 %5) %tmp8 = select <4 x i1> %8, <4 x i32> %tmp6, <4 x i32> %vec.phi - %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) + %tmp9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp8) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -134,7 +134,7 @@ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) declare <16 x i1> @llvm.arm.vctp8(i32) declare void @llvm.stackprotector(i8*, i8**) declare <8 x i1> @llvm.arm.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-liveout-lsr-shift.mir @@ -46,7 +46,7 @@ %.lcssa = phi <8 x i16> [ %15, %vector.body ] %18 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %7) %19 = select <8 x i1> %18, <8 x i16> %.lcssa, <8 x i16> %vec.phi.lcssa - %20 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %19) + %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %19) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -54,7 +54,7 @@ ret i16 %a.0.lcssa } declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) - declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) declare void @llvm.set.loop.iterations.i32(i32) declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -52,7 +52,7 @@ %n.splat = shufflevector <4 x i32> %insert.n, <4 x i32> undef, <4 x i32> zeroinitializer %tmp16 = icmp ult <4 x i32> %idx.splat, %n.splat %tmp17 = select <4 x i1> %tmp16, <4 x i32> %tmp13, <4 x i32> %vec.phi - %tmp18 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp17) + %tmp18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp17) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -60,7 +60,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -45,7 +45,7 @@ middle.block: ; preds = %vector.body %15 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %8) %16 = select <4 x i1> %15, <4 x i32> %12, <4 x i32> %vec.phi - %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %16) + %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %16) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry @@ -53,7 +53,7 @@ ret i32 %res.0.lcssa } declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #1 - declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #2 + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #2 declare void @llvm.set.loop.iterations.i32(i32) #3 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 declare <4 
x i1> @llvm.arm.mve.vctp32(i32) #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -572,7 +572,7 @@ br i1 %10, label %middle.block, label %vector.body, !llvm.loop !7 middle.block: ; preds = %vector.body - %11 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) ;for.cond8.for.cond.cleanup10_crit_edge.us.us: ; preds = %for.body11.us.us, %middle.block %add19.us.us = add i32 %j.051.us.us, %mul18.us %arrayidx20.us.us = getelementptr inbounds i32, i32* %C, i32 %add19.us.us @@ -803,7 +803,7 @@ br i1 %12, label %middle.block, label %vector.body, !llvm.loop !7 middle.block: ; preds = %vector.body - %13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %11) + %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11) br i1 %cmp.n, label %for.cond5.for.cond.cleanup7_crit_edge.us.us, label %for.body8.us.us.preheader for.cond5.for.cond.cleanup7_crit_edge.us.us: ; preds = %for.body8.us.us, %middle.block @@ -1065,7 +1065,7 @@ %wide.masked.gather75 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %tmp85, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef) %tmp86 = sext <4 x i8> %wide.masked.gather75 to <4 x i32> %tmp87 = mul nsw <4 x i32> %tmp84, %tmp86 - %tmp88 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp87) + %tmp88 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %tmp87) %tmp89 = add i32 %tmp88, %vec.phi %index.next = add i32 %index, 4 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> @@ -1091,7 +1091,7 @@ declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>) #3 -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare void @llvm.memset.p0i8.i32(i8* align 2, i8, i32, i1) declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -62,7 +62,7 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi - %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) store i32 %10, i32* %arrayidx.us.us, align 4 %inc21.us.us = add nuw i32 4, 1 %exitcond81.not = icmp eq i32 %inc21.us.us, %n @@ -139,7 +139,7 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi - %10 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) store i32 %10, i32* %arrayidx.us.us, align 4 %inc21.us.us = add nuw i32 4, 1 %exitcond81.not = icmp eq i32 %inc21.us.us, %n @@ -210,7 +210,7 @@ br i1 %8, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %9 = select <4 x i1> %active.lane.mask, <4 x i32> %7, <4 x i32> %vec.phi - %10 = call i32
@llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %9) + %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) store i32 %10, i32* %arrayidx.us.us, align 4 %inc21.us.us = add nuw i32 4, 1 %exitcond81.not = icmp eq i32 %inc21.us.us, %n @@ -440,7 +440,7 @@ ret void } -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1390,7 +1390,7 @@ declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) -declare i32 @llvm.experimental.vector.reduce.add.v16i8(<16 x i32> %ext4) +declare i32 @llvm.vector.reduce.add.v16i8(<16 x i32> %ext4) declare i32 @llvm.arm.mve.vmldava.v8i16(i32, i32, i32, i32, <8 x i16>, <8 x i16>) declare i32 @llvm.arm.mve.vmldava.predicated.v16i8.v16i1(i32, i32, i32, i32, <16 x i8>, <16 x i8>, <16 x i1>) declare i32 @llvm.arm.mve.vmldava.predicated.v8i16.v8i1(i32, i32, i32, i32, <8 x i16>, <8 x i16>, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll --- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s -declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>) +declare i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64>) +declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>) +declare i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>) +declare i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8>) define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) { ; CHECK-LABEL: vaddv_v2i64_i64: @@ -20,7 +20,7 @@ ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: bx lr entry: - %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1) + %r = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) ret i64 %r } @@ -30,7 +30,7 @@ ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %s1) ret i32 %r } @@ -41,7 +41,7 @@ ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %s1) + %r = call i32 
@llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1) ret i32 %r } @@ -51,7 +51,7 @@ ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16> %s1) ret i16 %r } @@ -62,7 +62,7 @@ ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %s1) + %r = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1) ret i16 %r } @@ -72,7 +72,7 @@ ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8> %s1) ret i8 %r } @@ -83,7 +83,7 @@ ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %r = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %s1) + %r = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1) ret i8 %r } @@ -102,7 +102,7 @@ ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: - %t = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1) + %t = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) %r = add i64 %t, %x ret i64 %r } @@ -113,7 +113,7 @@ ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1) + %t = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %s1) %r = add i32 %t, %x ret i32 %r } @@ -125,7 +125,7 @@ ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %s1) + %t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1) %r = add i32 %t, %x ret i32 %r } @@ -136,7 +136,7 @@ ; CHECK-NEXT: vaddva.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1) + %t = call i16 @llvm.vector.reduce.add.i16.v8i16(<8 x i16> %s1) %r = add i16 %t, %x ret i16 %r } @@ -148,7 +148,7 @@ ; CHECK-NEXT: vaddva.u16 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %s1) + %t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1) %r = add i16 %t, %x ret i16 %r } @@ -159,7 +159,7 @@ ; CHECK-NEXT: vaddva.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1) + %t = call i8 @llvm.vector.reduce.add.i8.v16i8(<16 x i8> %s1) %r = add i8 %t, %x ret i8 %r } @@ -171,7 +171,7 @@ ; CHECK-NEXT: vaddva.u8 r0, q0 ; CHECK-NEXT: bx lr entry: - %t = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> %s1) + %t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1) %r = add i8 %t, %x ret i8 %r } diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) ret i32 %z } @@ -18,7 +18,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i32> %x to <4 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) ret i64 %z } @@ -29,7 +29,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i32> %x to <4 x i64> - %z = call i64 
@llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) ret i64 %z } @@ -47,7 +47,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -65,7 +65,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -76,7 +76,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) ret i32 %z } @@ -87,7 +87,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) ret i32 %z } @@ -99,7 +99,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -111,7 +111,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -122,7 +122,7 @@ ; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) ret i16 %z } @@ -175,7 +175,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) ret i64 %z } @@ -242,7 +242,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) ret i64 %z } @@ -258,7 +258,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -278,7 +278,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -289,7 +289,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) ret i32 %z } @@ -300,7 +300,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) ret i32 %z } @@ -313,7 +313,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -326,7 +326,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> %xx) ret i32 %z } @@ -338,7 +338,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) ret i16 %z } @@ -350,7 +350,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) ret i16 %z } @@ -363,7 +363,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) ret i16 %z } @@ -376,7 +376,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) ret i16 %z } @@ -387,7 +387,7 @@ ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) ret i8 %z } @@ -492,7 +492,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) ret i64 %z } @@ -627,7 +627,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) ret i64 %z } @@ -643,7 +643,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -663,7 +663,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) ret i64 %z } @@ -678,7 +678,7 @@ ; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) ret i64 %z } @@ -688,7 +688,7 @@ ; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) %r = add i32 %z, %a ret i32 %r } @@ -700,7 +700,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i32> %x to <4 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -712,7 +712,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i32> %x to <4 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -735,7 +735,7 @@ ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -756,7 +756,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 
@llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -768,7 +768,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -780,7 +780,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -793,7 +793,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -806,7 +806,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -818,7 +818,7 @@ ; CHECK-NEXT: uxth r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) %r = add i16 %z, %a ret i16 %r } @@ -876,7 +876,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -948,7 +948,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <8 x i16> %x to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -967,7 +967,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -990,7 +990,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1002,7 +1002,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1014,7 +1014,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1028,7 +1028,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1042,7 +1042,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %xx) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) %r = add i32 %z, %a ret i32 %r } @@ -1055,7 +1055,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 
@llvm.vector.reduce.add.v16i16(<16 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1068,7 +1068,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1082,7 +1082,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1096,7 +1096,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i16> - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %xx) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) %r = add i16 %z, %a ret i16 %r } @@ -1108,7 +1108,7 @@ ; CHECK-NEXT: uxtb r0, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) %r = add i8 %z, %a ret i8 %r } @@ -1218,7 +1218,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1358,7 +1358,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1377,7 +1377,7 @@ ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1400,7 +1400,7 @@ ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) %r = add i64 %z, %a ret i64 %r } @@ -1420,18 +1420,18 @@ ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 
@llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -10,7 +10,7 @@ entry: %c = icmp eq <4 x i32> %b, zeroinitializer %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -24,7 +24,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = zext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -38,7 +38,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = sext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -73,7 +73,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = zext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -114,7 +114,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = sext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -128,7 +128,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i32> %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -142,7 +142,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i32> %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -158,7 +158,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = zext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -174,7 +174,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = sext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -188,7 +188,7 @@ entry: %c = icmp eq <8 x i16> %b, zeroinitializer %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -314,7 +314,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> 
%s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -456,7 +456,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -492,7 +492,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -537,7 +537,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = sext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -551,7 +551,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -565,7 +565,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -582,7 +582,7 @@ %c = icmp eq <4 x i8> %b, zeroinitializer %xx = zext <4 x i8> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -600,7 +600,7 @@ %c = icmp eq <4 x i8> %b, zeroinitializer %xx = sext <4 x i8> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -615,7 +615,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i16> %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -630,7 +630,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i16> %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -647,7 +647,7 @@ %c = icmp eq <8 x i8> %b, zeroinitializer %xx = zext <8 x i8> %x to <8 x i16> %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -664,7 +664,7 @@ %c = icmp eq <8 x i8> %b, zeroinitializer %xx = sext <8 x i8> %x to <8 x i16> %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -678,7 +678,7 @@ 
entry: %c = icmp eq <16 x i8> %b, zeroinitializer %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) ret i8 %z } @@ -948,7 +948,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1257,7 +1257,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i64> %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1293,7 +1293,7 @@ %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1338,7 +1338,7 @@ %c = icmp eq <2 x i8> %b, zeroinitializer %xx = sext <2 x i8> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1372,7 +1372,7 @@ entry: %c = icmp eq <2 x i64> %b, zeroinitializer %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1385,7 +1385,7 @@ entry: %c = icmp eq <4 x i32> %b, zeroinitializer %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1400,7 +1400,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = zext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1415,7 +1415,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %xx = sext <4 x i32> %x to <4 x i64> %s = select <4 x i1> %c, <4 x i64> %xx, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1455,7 +1455,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = zext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1501,7 +1501,7 @@ %c = icmp eq <2 x i32> %b, zeroinitializer %xx = sext <2 x i32> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1516,7 +1516,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i32> %s = 
select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1531,7 +1531,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i32> %s = select <8 x i1> %c, <8 x i32> %xx, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1548,7 +1548,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = zext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1565,7 +1565,7 @@ %c = icmp eq <4 x i16> %b, zeroinitializer %xx = sext <4 x i16> %x to <4 x i32> %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1580,7 +1580,7 @@ entry: %c = icmp eq <8 x i16> %b, zeroinitializer %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -1711,7 +1711,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = zext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1858,7 +1858,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %xx, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1897,7 +1897,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1947,7 +1947,7 @@ %c = icmp eq <2 x i16> %b, zeroinitializer %xx = sext <2 x i16> %x to <2 x i64> %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1962,7 +1962,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1977,7 +1977,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %xx = sext <16 x i8> %x to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %xx, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ 
-1995,7 +1995,7 @@
 %c = icmp eq <4 x i8> %b, zeroinitializer
 %xx = zext <4 x i8> %x to <4 x i32>
 %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
- %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+ %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
 %r = add i32 %z, %a
 ret i32 %r
 }
@@ -2014,7 +2014,7 @@
 %c = icmp eq <4 x i8> %b, zeroinitializer
 %xx = sext <4 x i8> %x to <4 x i32>
 %s = select <4 x i1> %c, <4 x i32> %xx, <4 x i32> zeroinitializer
- %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s)
+ %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
 %r = add i32 %z, %a
 ret i32 %r
 }
@@ -2030,7 +2030,7 @@
 %c = icmp eq <16 x i8> %b, zeroinitializer
 %xx = zext <16 x i8> %x to <16 x i16>
 %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer
- %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+ %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
 %r = add i16 %z, %a
 ret i16 %r
 }
@@ -2046,7 +2046,7 @@
 %c = icmp eq <16 x i8> %b, zeroinitializer
 %xx = sext <16 x i8> %x to <16 x i16>
 %s = select <16 x i1> %c, <16 x i16> %xx, <16 x i16> zeroinitializer
- %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s)
+ %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
 %r = add i16 %z, %a
 ret i16 %r
 }
@@ -2064,7 +2064,7 @@
 %c = icmp eq <8 x i8> %b, zeroinitializer
 %xx = zext <8 x i8> %x to <8 x i16>
 %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer
- %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+ %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
 %r = add i16 %z, %a
 ret i16 %r
 }
@@ -2082,7 +2082,7 @@
 %c = icmp eq <8 x i8> %b, zeroinitializer
 %xx = sext <8 x i8> %x to <8 x i16>
 %s = select <8 x i1> %c, <8 x i16> %xx, <8 x i16> zeroinitializer
- %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s)
+ %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
 %r = add i16 %z, %a
 ret i16 %r
 }
@@ -2097,7 +2097,7 @@
 entry:
 %c = icmp eq <16 x i8> %b, zeroinitializer
 %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> zeroinitializer
- %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s)
+ %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
 %r = add i8 %z, %a
 ret i8 %r
 }
@@ -2372,7 +2372,7 @@
 %c = icmp eq <16 x i8> %b, zeroinitializer
 %xx = zext <16 x i8> %x to <16 x i64>
 %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer
- %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+ %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
 %r = add i64 %z, %a
 ret i64 %r
 }
@@ -2686,7 +2686,7 @@
 %c = icmp eq <16 x i8> %b, zeroinitializer
 %xx = sext <16 x i8> %x to <16 x i64>
 %s = select <16 x i1> %c, <16 x i64> %xx, <16 x i64> zeroinitializer
- %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s)
+ %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
 %r = add i64 %z, %a
 ret i64 %r
 }
@@ -2725,7 +2725,7 @@
 %c = icmp eq <2 x i8> %b, zeroinitializer
 %xx = zext <2 x i8> %x to <2 x i64>
 %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
- %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+ %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
 %r = add i64 %z, %a
 ret i64 %r
 }
@@ -2775,7 +2775,7 @@
 %c = icmp eq <2 x i8> %b, zeroinitializer
 %xx = sext <2 x i8> %x to <2 x i64>
 %s = select <2 x i1> %c, <2 x i64> %xx, <2 x i64> zeroinitializer
- %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+ %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
 %r = add i64 %z, %a
 ret i64 %r
 }
@@ -2814,18 +2814,18 @@
 entry:
 %c = icmp eq <2 x i64> %b, zeroinitializer
 %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> zeroinitializer
- %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s)
+ %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
 %r = add i64 %z, %a
 ret i64 %r
 }

-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
@@ -9,7 +9,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %x)
+ %z = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %x)
 ret i32 %z
 }

@@ -25,7 +25,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %x)
+ %z = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x)
 ret i32 %z
 }

@@ -42,7 +42,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %x)
+ %z = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %x)
 ret i32 %z
 }

@@ -58,7 +58,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %x)
+ %z = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %x)
 ret i16 %z
 }

@@ -76,7 +76,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %x)
+ %z = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x)
 ret i16 %z
 }

@@ -95,7 +95,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %x)
+ %z = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %x)
 ret i16 %z
 }

@@ -113,7 +113,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %x)
+ %z = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x)
 ret i8 %z
 }

@@ -133,7 +133,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %x)
+ %z = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %x)
 ret i8 %z
 }

@@ -154,7 +154,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %x)
+ %z = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %x)
 ret i8 %z
 }

@@ -163,7 +163,7 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %x)
+ %z = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %x)
 ret i64 %z
 }

@@ -178,7 +178,7 @@
 ; CHECK-NEXT: ands r1, r2
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %x)
+ %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x)
 ret i64 %z
 }

@@ -194,7 +194,7 @@
 ; CHECK-NEXT: ands r1, r2
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %x)
+ %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x)
 ret i64 %z
 }

@@ -207,7 +207,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %x)
+ %z = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %x)
 %r = and i32 %y, %z
 ret i32 %r
 }
@@ -225,7 +225,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %x)
+ %z = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %x)
 %r = and i32 %y, %z
 ret i32 %r
 }
@@ -244,7 +244,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %x)
+ %z = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %x)
 %r = and i32 %y, %z
 ret i32 %r
 }
@@ -262,7 +262,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %x)
+ %z = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %x)
 %r = and i16 %y, %z
 ret i16 %r
 }
@@ -282,7 +282,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %x)
+ %z = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %x)
 %r = and i16 %y, %z
 ret i16 %r
 }
@@ -303,7 +303,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %x)
+ %z = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %x)
 %r = and i16 %y, %z
 ret i16 %r
 }
@@ -323,7 +323,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %x)
+ %z = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %x)
 %r = and i8 %y, %z
 ret i8 %r
 }
@@ -345,7 +345,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %x)
+ %z = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %x)
 %r = and i8 %y, %z
 ret i8 %r
 }
@@ -368,7 +368,7 @@
 ; CHECK-NEXT: ands r0, r1
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %x)
+ %z = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %x)
 %r = and i8 %y, %z
 ret i8 %r
 }
@@ -380,7 +380,7 @@
 ; CHECK-NEXT: ands r1, r3
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> %x)
+ %z = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %x)
 %r = and i64 %y, %z
 ret i64 %r
 }
@@ -398,7 +398,7 @@
 ; CHECK-NEXT: ands r1, r2
 ; CHECK-NEXT: bx lr
 entry:
- %z = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %x)
+ %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x)
 %r = and i64 %y, %z
 ret i64 %r
 }
@@ -417,7 +417,7 @@
 ; CHECK-NEXT: ands r1, r2
 ;
CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x) %r = and i64 %y, %z ret i64 %r } @@ -430,7 +430,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %x) ret i32 %z } @@ -446,7 +446,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x) ret i32 %z } @@ -463,7 +463,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %x) ret i32 %z } @@ -479,7 +479,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x) ret i16 %z } @@ -497,7 +497,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %x) ret i16 %z } @@ -516,7 +516,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %x) ret i16 %z } @@ -534,7 +534,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x) ret i8 %z } @@ -554,7 +554,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %x) ret i8 %z } @@ -575,7 +575,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %x) ret i8 %z } @@ -584,7 +584,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %x) ret i64 %z } @@ -599,7 +599,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x) ret i64 %z } @@ -615,7 +615,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x) ret i64 %z } @@ -628,7 +628,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %x) %r = or i32 %y, %z ret i32 %r } @@ -646,7 +646,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x) %r = or i32 %y, %z ret i32 %r } @@ -665,7 +665,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %x) %r = or i32 %y, %z ret i32 %r } @@ -683,7 +683,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: 
bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x) %r = or i16 %y, %z ret i16 %r } @@ -703,7 +703,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %x) %r = or i16 %y, %z ret i16 %r } @@ -724,7 +724,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %x) %r = or i16 %y, %z ret i16 %r } @@ -744,7 +744,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x) %r = or i8 %y, %z ret i8 %r } @@ -766,7 +766,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %x) %r = or i8 %y, %z ret i8 %r } @@ -789,7 +789,7 @@ ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %x) %r = or i8 %y, %z ret i8 %r } @@ -801,7 +801,7 @@ ; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %x) %r = or i64 %y, %z ret i64 %r } @@ -819,7 +819,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x) %r = or i64 %y, %z ret i64 %r } @@ -838,7 +838,7 @@ ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x) %r = or i64 %y, %z ret i64 %r } @@ -851,7 +851,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %x) ret i32 %z } @@ -867,7 +867,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %x) ret i32 %z } @@ -884,7 +884,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %x) ret i32 %z } @@ -900,7 +900,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %x) ret i16 %z } @@ -918,7 +918,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %x) ret i16 %z } @@ -937,7 +937,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %x) ret i16 %z } @@ -955,7 +955,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %x) + %z = call i8 
@llvm.vector.reduce.xor.v8i8(<8 x i8> %x) ret i8 %z } @@ -975,7 +975,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %x) ret i8 %z } @@ -996,7 +996,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %x) ret i8 %z } @@ -1005,7 +1005,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %x) ret i64 %z } @@ -1020,7 +1020,7 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x) ret i64 %z } @@ -1036,7 +1036,7 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x) ret i64 %z } @@ -1049,7 +1049,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %x) %r = xor i32 %y, %z ret i32 %r } @@ -1067,7 +1067,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %x) %r = xor i32 %y, %z ret i32 %r } @@ -1086,7 +1086,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %x) %r = xor i32 %y, %z ret i32 %r } @@ -1104,7 +1104,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %x) %r = xor i16 %y, %z ret i16 %r } @@ -1124,7 +1124,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %x) %r = xor i16 %y, %z ret i16 %r } @@ -1145,7 +1145,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %x) %r = xor i16 %y, %z ret i16 %r } @@ -1165,7 +1165,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %x) %r = xor i8 %y, %z ret i8 %r } @@ -1187,7 +1187,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %x) %r = xor i8 %y, %z ret i8 %r } @@ -1210,7 +1210,7 @@ ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %x) %r = xor i8 %y, %z ret i8 %r } @@ -1222,7 +1222,7 @@ ; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %x) %r = xor i64 %y, %z ret i64 %r } @@ 
-1240,7 +1240,7 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x) %r = xor i64 %y, %z ret i64 %r } @@ -1259,44 +1259,44 @@ ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x) %r = xor i64 %y, %z ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 
@llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) +declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: vadd.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x) + %z = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -30,7 +30,7 @@ ; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x) + %z = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -56,7 +56,7 @@ ; CHECK-NOFP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %y, <8 x float> %x) + %z = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -71,7 +71,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v2f16(half %y, <2 x half> %x) store half %z, half* %yy ret void } @@ -102,7 +102,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -139,7 +139,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -189,7 +189,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half %y, <16 x half> %x) + %z = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -200,7 +200,7 @@ ; CHECK-NEXT: vadd.f64 d0, d1, d0 ; 
CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double %y, <1 x double> %x) + %z = call fast double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -211,7 +211,7 @@ ; CHECK-NEXT: vadd.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %y, <2 x double> %x) + %z = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -224,7 +224,7 @@ ; CHECK-NEXT: vadd.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %y, <4 x double> %x) + %z = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x) ret double %z } @@ -235,7 +235,7 @@ ; CHECK-NEXT: vadd.f32 s0, s4, s1 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x) + %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -248,7 +248,7 @@ ; CHECK-NEXT: vadd.f32 s0, s4, s3 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x) + %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -265,7 +265,7 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s7 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %y, <8 x float> %x) + %z = call float @llvm.vector.reduce.fadd.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -283,7 +283,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x) + %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -308,7 +308,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x) + %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -345,7 +345,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half %y, <16 x half> %x) + %z = call half @llvm.vector.reduce.fadd.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -356,7 +356,7 @@ ; CHECK-NEXT: vadd.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double %y, <1 x double> %x) + %z = call double @llvm.vector.reduce.fadd.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -367,7 +367,7 @@ ; CHECK-NEXT: vadd.f64 d0, d2, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %y, <2 x double> %x) + %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -380,17 +380,17 @@ ; CHECK-NEXT: vadd.f64 d0, d0, d3 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %y, <4 x double> %x) + %z = call double @llvm.vector.reduce.fadd.f64.v4f64(double %y, <4 x double> %x) ret double %z } -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v1f64(double, <1 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) 
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half, <2 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>) +declare double @llvm.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>) +declare half @llvm.vector.reduce.fadd.f16.v2f16(half, <2 x half>) +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: vminnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z } @@ -27,7 +27,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) ret float %z } @@ -60,7 +60,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) ret float %z } @@ -83,7 +83,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z } @@ -112,7 +112,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) ret half %z } @@ -170,7 +170,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) ret half %z } @@ -179,7 +179,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) ret double %z } @@ -189,7 +189,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) ret double %z 
} @@ -205,7 +205,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) ret double %z } @@ -215,7 +215,7 @@ ; CHECK-NEXT: vminnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) ret float %z } @@ -234,7 +234,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) ret float %z } @@ -258,7 +258,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) ret float %z } @@ -281,7 +281,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) ret half %z } @@ -310,7 +310,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) ret half %z } @@ -352,7 +352,7 @@ ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) ret half %z } @@ -361,7 +361,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) ret double %z } @@ -371,7 +371,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) ret double %z } @@ -383,7 +383,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) ret double %z } @@ -394,7 +394,7 @@ ; CHECK-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp fast olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -417,7 +417,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) %c = fcmp fast olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -453,7 +453,7 @@ ; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) %c = fcmp fast olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -485,7 +485,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half 
@llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -503,7 +503,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v2f16(<2 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -542,7 +542,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -610,7 +610,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) %c = fcmp fast olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -623,7 +623,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) %c = fcmp fast olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -636,7 +636,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) %c = fcmp fast olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -655,7 +655,7 @@ ; CHECK-NEXT: vminnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) %c = fcmp fast olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -670,7 +670,7 @@ ; CHECK-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %x) %c = fcmp olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -697,7 +697,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) %c = fcmp olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -729,7 +729,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) %c = fcmp olt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -765,7 +765,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) %c = fcmp olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -808,7 +808,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x) + %z = 
call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) %c = fcmp olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -864,7 +864,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) %c = fcmp olt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -879,7 +879,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %x) %c = fcmp olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -894,7 +894,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %x) %c = fcmp olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -911,7 +911,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) %c = fcmp olt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -923,7 +923,7 @@ ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z } @@ -942,7 +942,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) ret float %z } @@ -974,7 +974,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) ret float %z } @@ -997,7 +997,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z } @@ -1026,7 +1026,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) ret half %z } @@ -1084,7 +1084,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) ret half %z } @@ -1093,7 +1093,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) ret double %z } @@ -1103,7 +1103,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) ret double %z } @@ -1119,7 +1119,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call fast double 
@llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) ret double %z } @@ -1129,7 +1129,7 @@ ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) ret float %z } @@ -1148,7 +1148,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) ret float %z } @@ -1172,7 +1172,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) ret float %z } @@ -1195,7 +1195,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) ret half %z } @@ -1224,7 +1224,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) ret half %z } @@ -1266,7 +1266,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) ret half %z } @@ -1275,7 +1275,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) ret double %z } @@ -1285,7 +1285,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) ret double %z } @@ -1297,7 +1297,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) ret double %z } @@ -1308,7 +1308,7 @@ ; CHECK-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp fast ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1331,7 +1331,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) %c = fcmp fast ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1367,7 +1367,7 @@ ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) %c = fcmp fast ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1384,7 +1384,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x) + %z = call fast half 
@llvm.vector.reduce.fmax.v2f16(<2 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1417,7 +1417,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1456,7 +1456,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1524,7 +1524,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) %c = fcmp fast ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1537,7 +1537,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) %c = fcmp fast ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1550,7 +1550,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) %c = fcmp fast ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1569,7 +1569,7 @@ ; CHECK-NEXT: vmaxnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) %c = fcmp fast ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1584,7 +1584,7 @@ ; CHECK-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %x) %c = fcmp ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1611,7 +1611,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) %c = fcmp ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1643,7 +1643,7 @@ ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %x) + %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) %c = fcmp ogt float %y, %z %r = select i1 %c, float %y, float %z ret float %r @@ -1679,7 +1679,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) %c = fcmp ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1722,7 +1722,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) 
%c = fcmp ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1778,7 +1778,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half> %x) + %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) %c = fcmp ogt half %y, %z %r = select i1 %c, half %y, half %z store half %r, half* %yy @@ -1793,7 +1793,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %x) %c = fcmp ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1808,7 +1808,7 @@ ; CHECK-NEXT: vselgt.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %x) %c = fcmp ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r @@ -1825,29 +1825,29 @@ ; CHECK-NEXT: vselgt.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %x) + %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) %c = fcmp ogt double %y, %z %r = select i1 %c, double %y, double %z ret double %r } -declare double @llvm.experimental.vector.reduce.fmax.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v1f64(<1 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>) -declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>) +declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare half @llvm.vector.reduce.fmax.v16f16(<16 x 
half>) +declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>) +declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>) +declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) +declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: vmul.f32 s0, s4, s0 ; CHECK-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %y, <2 x float> %x) + %z = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -30,7 +30,7 @@ ; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x) + %z = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -56,7 +56,7 @@ ; CHECK-NOFP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: - %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %y, <8 x float> %x) + %z = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -71,7 +71,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x) store half %z, half* %yy ret void } @@ -102,7 +102,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -139,7 +139,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -189,7 +189,7 @@ ; CHECK-NOFP-NEXT: bx lr entry: %y = load half, half* %yy - %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half %y, <16 x half> %x) + %z = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -200,7 +200,7 @@ ; CHECK-NEXT: vmul.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double %y, <1 x double> %x) + %z = call fast double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -211,7 +211,7 @@ ; CHECK-NEXT: vmul.f64 d0, d2, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %y, <2 x double> %x) + %z = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -224,7 +224,7 @@ ; CHECK-NEXT: vmul.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: - %z = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %y, <4 x double> %x) + %z = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x) ret double %z } @@ -235,7 +235,7 @@ ; CHECK-NEXT: vmul.f32 s0, s4, s1 
; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %y, <2 x float> %x) + %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x) ret float %z } @@ -248,7 +248,7 @@ ; CHECK-NEXT: vmul.f32 s0, s4, s3 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x) + %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x) ret float %z } @@ -265,7 +265,7 @@ ; CHECK-NEXT: vmul.f32 s0, s0, s7 ; CHECK-NEXT: bx lr entry: - %z = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %y, <8 x float> %x) + %z = call float @llvm.vector.reduce.fmul.f32.v8f32(float %y, <8 x float> %x) ret float %z } @@ -280,7 +280,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x) store half %z, half* %yy ret void } @@ -299,7 +299,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x) store half %z, half* %yy ret void } @@ -324,7 +324,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x) store half %z, half* %yy ret void } @@ -361,7 +361,7 @@ ; CHECK-NEXT: bx lr entry: %y = load half, half* %yy - %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half %y, <16 x half> %x) + %z = call half @llvm.vector.reduce.fmul.f16.v16f16(half %y, <16 x half> %x) store half %z, half* %yy ret void } @@ -372,7 +372,7 @@ ; CHECK-NEXT: vmul.f64 d0, d1, d0 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double %y, <1 x double> %x) + %z = call double @llvm.vector.reduce.fmul.f64.v1f64(double %y, <1 x double> %x) ret double %z } @@ -383,7 +383,7 @@ ; CHECK-NEXT: vmul.f64 d0, d2, d1 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %y, <2 x double> %x) + %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x) ret double %z } @@ -396,17 +396,17 @@ ; CHECK-NEXT: vmul.f64 d0, d0, d3 ; CHECK-NEXT: bx lr entry: - %z = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %y, <4 x double> %x) + %z = call double @llvm.vector.reduce.fmul.f64.v4f64(double %y, <4 x double> %x) ret double %z } -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v1f64(double, <1 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half, <16 x half>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half, <2 x half>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>) -declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half, <8 x 
half>) +declare double @llvm.vector.reduce.fmul.f64.v1f64(double, <1 x double>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>) +declare half @llvm.vector.reduce.fmul.f16.v2f16(half, <2 x half>) +declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -65,7 +65,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %2 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %wide.load) + %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load) %3 = add i32 %2, %vec.phi %index.next = add i32 %index, 4 %4 = icmp eq i32 %index.next, %n.vec @@ -167,7 +167,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -267,7 +267,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -367,7 +367,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -467,7 +467,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %2) + %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -568,7 +568,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2) + %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -665,7 +665,7 @@ br i1 %3, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %4 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2) + %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -762,7 +762,7 @@ br i1 %4, label %middle.block, label 
%vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -854,7 +854,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load) %2 = icmp slt i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -960,7 +960,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1052,7 +1052,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load) %2 = icmp sgt i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -1158,7 +1158,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1250,7 +1250,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load) %2 = icmp ult i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -1356,7 +1356,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %3) + %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1447,7 +1447,7 @@ %0 = getelementptr inbounds i32, i32* %x, i32 %index %1 = bitcast i32* %0 to <4 x i32>* %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 - %l5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %wide.load) + %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load) %2 = icmp ugt i32 %vec.phi, %l5 %3 = select i1 %2, i32 %vec.phi, i32 %l5 %index.next = add i32 %index, 4 @@ -1560,7 +1560,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %3) + %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1665,7 +1665,7 @@ br i1 %4, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %5 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %3) + %5 = call float 
@llvm.vector.reduce.fmax.v4f32(<4 x float> %3) %cmp.n = icmp eq i32 %n.vec, %n br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 @@ -1729,7 +1729,7 @@ %1 = bitcast i32* %0 to <4 x i32>* %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer - %3 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %2) + %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) %4 = add i32 %3, %vec.phi %index.next = add i32 %index, 4 %5 = icmp eq i32 %index.next, %n.vec @@ -1784,7 +1784,7 @@ %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer - %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) %7 = add i32 %6, %vec.phi %index.next = add i32 %index, 4 %8 = icmp eq i32 %index.next, %n.vec @@ -1835,7 +1835,7 @@ %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %2 = sext <8 x i16> %wide.masked.load to <8 x i32> %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer - %4 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %3) + %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) %5 = add i32 %4, %vec.phi %index.next = add i32 %index, 8 %6 = icmp eq i32 %index.next, %n.vec @@ -1892,7 +1892,7 @@ %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32> %6 = mul nsw <8 x i32> %5, %2 %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer - %8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7) %9 = add i32 %8, %vec.phi %index.next = add i32 %index, 8 %10 = icmp eq i32 %index.next, %n.vec @@ -1943,7 +1943,7 @@ %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %2 = zext <16 x i8> %wide.masked.load to <16 x i32> %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer - %4 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %3) + %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) %5 = add i32 %4, %vec.phi %index.next = add i32 %index, 16 %6 = icmp eq i32 %index.next, %n.vec @@ -2000,7 +2000,7 @@ %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32> %6 = mul nuw nsw <16 x i32> %5, %2 %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer - %8 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %7) + %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7) %9 = add i32 %8, %vec.phi %index.next = add i32 %index, 16 %10 = icmp eq i32 %index.next, %n.vec @@ -2050,7 +2050,7 @@ %1 = bitcast i16* %0 to <8 x i16>* %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer - %3 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %2) + %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) %4 = add i16 %3, %vec.phi %index.next = add i32 %index, 8 %5 = 
icmp eq i32 %index.next, %n.vec @@ -2105,7 +2105,7 @@ %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer - %6 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %5) + %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5) %7 = add i16 %6, %vec.phi %index.next = add i32 %index, 8 %8 = icmp eq i32 %index.next, %n.vec @@ -2156,7 +2156,7 @@ %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %2 = zext <16 x i8> %wide.masked.load to <16 x i16> %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer - %4 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %3) + %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3) %5 = add i16 %4, %vec.phi %index.next = add i32 %index, 16 %6 = icmp eq i32 %index.next, %n.vec @@ -2213,7 +2213,7 @@ %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16> %6 = mul nuw <16 x i16> %5, %2 %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer - %8 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %7) + %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7) %9 = add i16 %8, %vec.phi %index.next = add i32 %index, 16 %10 = icmp eq i32 %index.next, %n.vec @@ -2263,7 +2263,7 @@ %1 = bitcast i8* %0 to <16 x i8>* %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer - %3 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %2) + %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) %4 = add i8 %3, %vec.phi %index.next = add i32 %index, 16 %5 = icmp eq i32 %index.next, %n.vec @@ -2318,7 +2318,7 @@ %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer - %6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %5) + %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5) %7 = add i8 %6, %vec.phi %index.next = add i32 %index, 16 %8 = icmp eq i32 %index.next, %n.vec @@ -2371,7 +2371,7 @@ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) %2 = sext <4 x i32> %wide.masked.load to <4 x i64> %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer - %4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %3) + %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3) %5 = add i64 %4, %vec.phi %index.next = add i32 %index, 4 %6 = icmp eq i32 %index.next, %n.vec @@ -2430,7 +2430,7 @@ %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64> %6 = mul nsw <4 x i64> %5, %2 %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer - %8 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %7) + %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7) %9 = add i64 %8, %vec.phi %index.next = add i32 %index, 4 %10 = icmp eq i32 %index.next, %n.vec @@ -2489,7 +2489,7 @@ %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64> %6 = mul 
nsw <8 x i64> %5, %2 %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer - %8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %7) + %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7) %9 = add i64 %8, %vec.phi %index.next = add i32 %index, 8 %10 = icmp eq i32 %index.next, %n.vec @@ -2504,26 +2504,26 @@ declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2 -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) #3 +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2 -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) #3 -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #3 -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) #3 -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #3 -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) #3 -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) #3 - -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3 +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3 +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3 +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3 +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3 +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3 + +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <4 x i32> %x, %y - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -21,7 +21,7 @@ %xx = zext <4 x i32> %x to <4 x i64> %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) ret i64 %z } @@ -34,7 +34,7 @@ %xx = sext <4 x i32> %x to <4 x i64> %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) ret i64 %z } @@ -53,7 +53,7 @@ %xx = zext <2 x i32> %x to <2 x i64> %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -72,7 +72,7 @@ %xx = sext <2 x i32> %x to <2 x i64> %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -85,7 +85,7 @@ %xx = zext <8 x i16> %x to <8 x i32> %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m) ret i32 %z } @@ -98,7 +98,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m) ret i32 %z } @@ -113,7 +113,7 @@ %xx = zext <4 x i16> %x to <4 x i32> %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -128,7 +128,7 @@ %xx = sext <4 x i16> %x to <4 x i32> %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -140,7 +140,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <8 x i16> %x, %y - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) ret i16 %z } @@ -153,7 +153,7 @@ %xx = zext <8 x i16> %x to <8 x i64> %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) ret i64 %z } @@ -166,7 +166,7 @@ %xx = sext <8 x i16> %x to <8 x i64> %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) ret i64 %z } @@ -180,7 +180,7 @@ %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) ret i64 %z } @@ -194,7 +194,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> - %z = call 
i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) ret i64 %z } @@ -207,7 +207,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) ret i64 %z } @@ -228,7 +228,7 @@ %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -250,7 +250,7 @@ %xx = sext <2 x i16> %x to <2 x i64> %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -263,7 +263,7 @@ %xx = zext <16 x i8> %x to <16 x i32> %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) ret i32 %z } @@ -276,7 +276,7 @@ %xx = sext <16 x i8> %x to <16 x i32> %yy = sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) ret i32 %z } @@ -290,7 +290,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) ret i32 %z } @@ -304,7 +304,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) ret i32 %z } @@ -317,7 +317,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) ret i32 %z } @@ -333,7 +333,7 @@ %xx = zext <4 x i8> %x to <4 x i32> %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -350,7 +350,7 @@ %xx = sext <4 x i8> %x to <4 x i32> %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) ret i32 %z } @@ -364,7 +364,7 @@ %xx = zext <16 x i8> %x to <16 x i16> %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) ret i16 %z } @@ -378,7 +378,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) ret i16 %z } @@ -394,7 +394,7 @@ %xx = zext <8 x i8> %x to <8 x i16> %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) ret i16 %z } @@ -410,7 +410,7 @@ %xx = sext <8 x i8> %x to <8 x i16> %yy = sext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) ret i16 %z } @@ -422,7 +422,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <16 x i8> %x, %y - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m) ret i8 %z } @@ -636,7 +636,7 @@ %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) ret i64 %z } @@ -803,7 +803,7 @@ %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) ret i64 %z } @@ -826,7 +826,7 @@ %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -848,7 +848,7 @@ %xx = sext <2 x i8> %x to <2 x i64> %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -879,7 +879,7 @@ ; CHECK-NEXT: pop {r4, pc} entry: %m = mul <2 x i64> %x, %y - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) ret i64 %z } @@ -890,7 +890,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <4 x i32> %x, %y - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -904,7 +904,7 @@ %xx = zext <4 x i32> %x to <4 x i64> %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -918,7 +918,7 @@ %xx = sext <4 x i32> %x to <4 x i64> %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -942,7 +942,7 @@ %xx = zext <2 x i32> %x to <2 x i64> %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -966,7 +966,7 @@ %xx = sext <2 x i32> %x to <2 x i64> %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -980,7 +980,7 @@ %xx = zext <8 x i16> %x to <8 x i32> %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 
x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -994,7 +994,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1010,7 +1010,7 @@ %xx = zext <4 x i16> %x to <4 x i32> %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1026,7 +1026,7 @@ %xx = sext <4 x i16> %x to <4 x i32> %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1039,7 +1039,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <8 x i16> %x, %y - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1053,7 +1053,7 @@ %xx = zext <8 x i16> %x to <8 x i64> %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1067,7 +1067,7 @@ %xx = sext <8 x i16> %x to <8 x i64> %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1082,7 +1082,7 @@ %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) %r = add i64 %z, %a ret i64 %r } @@ -1097,7 +1097,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) %r = add i64 %z, %a ret i64 %r } @@ -1111,7 +1111,7 @@ %xx = sext <8 x i16> %x to <8 x i32> %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %ma) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma) %r = add i64 %z, %a ret i64 %r } @@ -1137,7 +1137,7 @@ %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1164,7 +1164,7 @@ %xx = sext <2 x i16> %x to <2 x i64> %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1178,7 +1178,7 @@ %xx = zext <16 x i8> %x to <16 x i32> %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1192,7 +1192,7 @@ %xx = sext <16 x i8> %x to <16 x i32> %yy = 
sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1207,7 +1207,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) %r = add i32 %z, %a ret i32 %r } @@ -1222,7 +1222,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) %r = add i32 %z, %a ret i32 %r } @@ -1236,7 +1236,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %ma) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma) %r = add i32 %z, %a ret i32 %r } @@ -1253,7 +1253,7 @@ %xx = zext <4 x i8> %x to <4 x i32> %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1271,7 +1271,7 @@ %xx = sext <4 x i8> %x to <4 x i32> %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m) %r = add i32 %z, %a ret i32 %r } @@ -1286,7 +1286,7 @@ %xx = zext <16 x i8> %x to <16 x i16> %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1301,7 +1301,7 @@ %xx = sext <16 x i8> %x to <16 x i16> %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1318,7 +1318,7 @@ %xx = zext <8 x i8> %x to <8 x i16> %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1335,7 +1335,7 @@ %xx = sext <8 x i8> %x to <8 x i16> %yy = sext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m) %r = add i16 %z, %a ret i16 %r } @@ -1348,7 +1348,7 @@ ; CHECK-NEXT: bx lr entry: %m = mul <16 x i8> %x, %y - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m) %r = add i8 %z, %a ret i8 %r } @@ -1565,7 +1565,7 @@ %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1737,7 +1737,7 @@ %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy - %z = call i64 
@llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1765,7 +1765,7 @@ %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1792,7 +1792,7 @@ %xx = sext <2 x i8> %x to <2 x i64> %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } @@ -1826,18 +1826,18 @@ ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %m = mul <2 x i64> %x, %y - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -11,7 +11,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %m = mul <4 x i32> %x, %y %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -27,7 +27,7 @@ %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -43,7 +43,7 @@ %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) ret i64 %z } @@ -79,7 +79,7 @@ %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call 
i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -115,7 +115,7 @@ %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -131,7 +131,7 @@ %yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -147,7 +147,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) ret i32 %z } @@ -166,7 +166,7 @@ %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -185,7 +185,7 @@ %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -200,7 +200,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %m = mul <8 x i16> %x, %y %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -216,7 +216,7 @@ %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -232,7 +232,7 @@ %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -249,7 +249,7 @@ %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -266,7 +266,7 @@ %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -282,7 +282,7 @@ %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) ret i64 %z } @@ -334,7 +334,7 @@ %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = 
call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -385,7 +385,7 @@ %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -401,7 +401,7 @@ %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -417,7 +417,7 @@ %yy = sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -434,7 +434,7 @@ %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -451,7 +451,7 @@ %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -467,7 +467,7 @@ %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) ret i32 %z } @@ -487,7 +487,7 @@ %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -509,7 +509,7 @@ %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) ret i32 %z } @@ -526,7 +526,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -543,7 +543,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) ret i16 %z } @@ -563,7 +563,7 @@ %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -583,7 +583,7 @@ %yy = sext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 
@llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) ret i16 %z } @@ -598,7 +598,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %m = mul <16 x i8> %x, %y %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) ret i8 %z } @@ -1010,7 +1010,7 @@ %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1353,7 +1353,7 @@ %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) ret i64 %z } @@ -1405,7 +1405,7 @@ %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1456,7 +1456,7 @@ %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1509,7 +1509,7 @@ %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) ret i64 %z } @@ -1523,7 +1523,7 @@ %c = icmp eq <4 x i32> %b, zeroinitializer %m = mul <4 x i32> %x, %y %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1540,7 +1540,7 @@ %yy = zext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1557,7 +1557,7 @@ %yy = sext <4 x i32> %y to <4 x i64> %m = mul <4 x i64> %xx, %yy %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1598,7 +1598,7 @@ %yy = zext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1639,7 +1639,7 @@ %yy = sext <2 x i32> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1656,7 +1656,7 @@ 
%yy = zext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1673,7 +1673,7 @@ %yy = sext <8 x i16> %y to <8 x i32> %m = mul <8 x i32> %xx, %yy %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1693,7 +1693,7 @@ %yy = zext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1713,7 +1713,7 @@ %yy = sext <4 x i16> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1729,7 +1729,7 @@ %c = icmp eq <8 x i16> %b, zeroinitializer %m = mul <8 x i16> %x, %y %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -1746,7 +1746,7 @@ %yy = zext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1763,7 +1763,7 @@ %yy = sext <8 x i16> %y to <8 x i64> %m = mul <8 x i64> %xx, %yy %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1781,7 +1781,7 @@ %m = mul <8 x i32> %xx, %yy %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1799,7 +1799,7 @@ %m = mul <8 x i32> %xx, %yy %ma = sext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1816,7 +1816,7 @@ %m = mul <8 x i32> %xx, %xx %ma = zext <8 x i32> %m to <8 x i64> %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1873,7 +1873,7 @@ %yy = zext <2 x i16> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1929,7 +1929,7 @@ %yy = sext <2 x i16> %y to <2 x i64> %m = mul <2 
x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -1946,7 +1946,7 @@ %yy = zext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1963,7 +1963,7 @@ %yy = sext <16 x i8> %y to <16 x i32> %m = mul <16 x i32> %xx, %yy %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1981,7 +1981,7 @@ %m = mul <16 x i16> %xx, %yy %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -1999,7 +1999,7 @@ %m = mul <16 x i16> %xx, %yy %ma = sext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2016,7 +2016,7 @@ %m = mul <16 x i16> %xx, %xx %ma = zext <16 x i16> %m to <16 x i32> %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2037,7 +2037,7 @@ %yy = zext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2060,7 +2060,7 @@ %yy = sext <4 x i8> %y to <4 x i32> %m = mul <4 x i32> %xx, %yy %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer - %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %s) + %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s) %r = add i32 %z, %a ret i32 %r } @@ -2078,7 +2078,7 @@ %yy = zext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2096,7 +2096,7 @@ %yy = sext <16 x i8> %y to <16 x i16> %m = mul <16 x i16> %xx, %yy %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2117,7 +2117,7 @@ %yy = zext <8 x i8> %y to <8 x i16> %m = mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2138,7 +2138,7 @@ %yy = sext <8 x i8> %y to <8 x i16> %m 
= mul <8 x i16> %xx, %yy %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer - %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %s) + %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s) %r = add i16 %z, %a ret i16 %r } @@ -2154,7 +2154,7 @@ %c = icmp eq <16 x i8> %b, zeroinitializer %m = mul <16 x i8> %x, %y %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer - %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %s) + %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s) %r = add i8 %z, %a ret i8 %r } @@ -2569,7 +2569,7 @@ %yy = zext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2917,7 +2917,7 @@ %yy = sext <16 x i8> %y to <16 x i64> %m = mul <16 x i64> %xx, %yy %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -2974,7 +2974,7 @@ %yy = zext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -3030,7 +3030,7 @@ %yy = sext <2 x i8> %y to <2 x i64> %m = mul <2 x i64> %xx, %yy %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } @@ -3088,18 +3088,18 @@ %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer - %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %s) + %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s) %r = add i64 %z, %a ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %x) ret i32 %z } @@ -25,7 +25,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %x) ret i32 %z } @@ -42,7 +42,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %x) ret i32 %z } @@ -58,7 +58,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %x) ret i16 %z } @@ -76,7 +76,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %x) ret i16 %z } @@ -95,7 +95,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %x) ret i16 %z } @@ -113,7 +113,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %x) ret i8 %z } @@ -133,7 +133,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %x) ret i8 %z } @@ -154,7 +154,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %x) ret i8 %z } @@ -163,7 +163,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %x) ret i64 %z } @@ -179,7 +179,7 @@ ; CHECK-NEXT: mla r1, r3, r1, r2 ; CHECK-NEXT: bx lr entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) ret i64 %z } @@ -207,7 +207,7 @@ ; CHECK-NEXT: mla r1, r1, r6, r4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) ret i64 %z } @@ -220,7 +220,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %x) %r = mul i32 %y, %z ret i32 %r } @@ -238,7 +238,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %x) %r = mul i32 %y, %z ret i32 %r } @@ -257,7 +257,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %x) + %z = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %x) %r = mul i32 %y, %z ret i32 %r } @@ -275,7 +275,7 @@ ; CHECK-NEXT: muls r0, 
r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %x) %r = mul i16 %y, %z ret i16 %r } @@ -295,7 +295,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %x) %r = mul i16 %y, %z ret i16 %r } @@ -316,7 +316,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %x) + %z = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %x) %r = mul i16 %y, %z ret i16 %r } @@ -336,7 +336,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %x) %r = mul i8 %y, %z ret i8 %r } @@ -358,7 +358,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %x) %r = mul i8 %y, %z ret i8 %r } @@ -381,7 +381,7 @@ ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr entry: - %z = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %x) + %z = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %x) %r = mul i8 %y, %z ret i8 %r } @@ -397,7 +397,7 @@ ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %x) %r = mul i64 %y, %z ret i64 %r } @@ -420,7 +420,7 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r4, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) %r = mul i64 %y, %z ret i64 %r } @@ -453,20 +453,20 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: - %z = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %x) + %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) %r = mul i64 %y, %z ret i64 %r } -declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) -declare i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) -declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) +declare i8 
@llvm.vector.reduce.mul.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8(<16 x i8> %s1) { ; CHECK-LABEL: vmaxv_s_v16i8: @@ -20,7 +20,7 @@ ; CHECK-NEXT: mvn r0, #127 ; CHECK-NEXT: vmaxv.s8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1) ret i8 %r } @@ -31,7 +31,7 @@ ; CHECK-NEXT: movt r0, #65535 ; CHECK-NEXT: vmaxv.s16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1) ret i16 %r } @@ -41,7 +41,7 @@ ; CHECK-NEXT: mov.w r0, #-2147483648 ; CHECK-NEXT: vmaxv.s32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %s1) ret i32 %r } @@ -51,7 +51,7 @@ ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmaxv.u8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1) ret i8 %r } @@ -61,7 +61,7 @@ ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmaxv.u16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1) ret i16 %r } @@ -71,7 +71,7 @@ ; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: vmaxv.u32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %s1) ret i32 %r } @@ -81,7 +81,7 @@ ; CHECK-NEXT: movs r0, #127 ; CHECK-NEXT: 
vminv.s8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1) ret i8 %r } @@ -91,7 +91,7 @@ ; CHECK-NEXT: movw r0, #32767 ; CHECK-NEXT: vminv.s16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1) ret i16 %r } @@ -101,7 +101,7 @@ ; CHECK-NEXT: mvn r0, #-2147483648 ; CHECK-NEXT: vminv.s32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %s1) ret i32 %r } @@ -111,7 +111,7 @@ ; CHECK-NEXT: movs r0, #255 ; CHECK-NEXT: vminv.u8 r0, q0 ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1) ret i8 %r } @@ -121,7 +121,7 @@ ; CHECK-NEXT: movw r0, #65535 ; CHECK-NEXT: vminv.u16 r0, q0 ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1) ret i16 %r } @@ -131,7 +131,7 @@ ; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: vminv.u32 r0, q0 ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %s1) ret i32 %r } @@ -147,7 +147,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1) %c = icmp sgt i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -162,7 +162,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %s1) %rs = sext i8 %r to i32 %c = icmp sgt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -180,7 +180,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1) %c = icmp sgt i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -196,7 +196,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %s1) %rs = sext i16 %r to i32 %c = icmp sgt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -211,7 +211,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, gt ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %s1) %c = icmp sgt i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s @@ -227,7 +227,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1) %c = icmp ugt i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -242,7 +242,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %s1) %rs = zext i8 %r to 
i32 %c = icmp ugt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -259,7 +259,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1) %c = icmp ugt i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -274,7 +274,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %s1) %rs = zext i16 %r to i32 %c = icmp ugt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -289,7 +289,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, hi ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %s1) %c = icmp ugt i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s @@ -305,7 +305,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1) %c = icmp slt i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -320,7 +320,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %s1) %rs = sext i8 %r to i32 %c = icmp slt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -337,7 +337,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1) %c = icmp slt i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -352,7 +352,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %s1) %rs = sext i16 %r to i32 %c = icmp slt i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -367,7 +367,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lt ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %s1) %c = icmp slt i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s @@ -383,7 +383,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1) %c = icmp ult i8 %r, %s2 %s = select i1 %c, i8 %r, i8 %s2 ret i8 %s @@ -398,7 +398,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1) + %r = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %s1) %rs = zext i8 %r to i32 %c = icmp ult i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -415,7 +415,7 @@ ; CHECK-NEXT: cmp r2, r3 ; CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1) %c = icmp ult i16 %r, %s2 %s = select i1 %c, i16 %r, i16 %s2 ret i16 %s @@ -430,7 +430,7 @@ ; CHECK-NEXT: cmp r1, r0 ; 
CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1) + %r = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %s1) %rs = zext i16 %r to i32 %c = icmp ult i32 %rs, %s2 %s = select i1 %c, i32 %rs, i32 %s2 @@ -445,7 +445,7 @@ ; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: csel r0, r1, r0, lo ; CHECK-NEXT: bx lr - %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1) + %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %s1) %c = icmp ult i32 %r, %s2 %s = select i1 %c, i32 %r, i32 %s2 ret i32 %s diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1628,8 +1628,8 @@ ; Repeat tests from general reductions to verify output for hoppy targets: ; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: @@ -1672,7 +1672,7 @@ ; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1711,7 +1711,7 @@ ; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq - %r = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll --- a/llvm/test/CodeGen/X86/pr45378.ll +++ b/llvm/test/CodeGen/X86/pr45378.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) define i1 @parseHeaders(i64 * %ptr) nounwind { ; SSE2-LABEL: parseHeaders: @@ -34,7 +34,7 @@ ; AVX-NEXT: retq %vptr = bitcast i64 * %ptr to <2 x i64> * %vload = load <2 x i64>, <2 x i64> * %vptr, align 8 - %vreduce = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %vload) + %vreduce = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %vload) %vcheck = icmp eq i64 %vreduce, 0 ret i1 %vcheck } diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -32,7 +32,7 @@ ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -74,7 +74,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0) + %1 = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -124,7 +124,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -187,7 +187,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -229,7 +229,7 @@ ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -276,7 +276,7 @@ ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -336,7 +336,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -408,7 +408,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -499,7 +499,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -547,7 +547,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -601,7 +601,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -663,7 +663,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -738,7 +738,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -826,7 +826,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -933,7 +933,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -966,7 +966,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def 
$al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1002,7 +1002,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %a0 = load <2 x i8>, <2 x i8>* %p - %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1043,7 +1043,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1075,7 +1075,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %a0 = load <4 x i8>, <4 x i8>* %p - %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1103,7 +1103,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1135,7 +1135,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq %a0 = load <8 x i8>, <8 x i8>* %p - %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1169,7 +1169,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1223,7 +1223,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1285,7 +1285,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1360,32 +1360,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x 
i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -59,7 +59,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b } @@ -111,7 +111,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -176,7 +176,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -205,7 +205,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -262,7 +262,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -351,7 +351,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -420,7 +420,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> - %b = call i1 
@llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -492,7 +492,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -597,7 +597,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i64> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -678,7 +678,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = trunc <16 x i32> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -765,7 +765,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -845,7 +845,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> - %b = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b } @@ -905,7 +905,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b } @@ -961,7 +961,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -1014,7 +1014,7 @@ ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = icmp eq <8 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -1062,7 +1062,7 @@ ; AVX512VL-NEXT: setb %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -1141,7 +1141,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } @@ -1207,7 +1207,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -1274,7 +1274,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -1354,7 +1354,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -1447,7 +1447,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i64> %0, zeroinitializer - %b = call i1 
@llvm.experimental.vector.reduce.and.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b } @@ -1507,7 +1507,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = icmp eq <16 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } @@ -1595,7 +1595,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } @@ -1686,13 +1686,13 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b } -declare i1 @llvm.experimental.vector.reduce.and.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -28,7 +28,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -79,7 +79,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -136,7 +136,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -202,7 +202,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -229,7 +229,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -256,7 +256,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -315,7 +315,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0) + %1 = call i32 
@llvm.vector.reduce.and.v8i32(<8 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -380,7 +380,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -454,7 +454,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -482,7 +482,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -510,7 +510,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -542,7 +542,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -610,7 +610,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -684,7 +684,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -767,7 +767,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -795,7 +795,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -824,7 +824,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -857,7 +857,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -894,7 +894,7 @@ ; AVX-NEXT: testb %al, %al ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -971,7 +971,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -1054,7 +1054,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 
@llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -1146,33 +1146,33 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -24,7 +24,7 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -66,7 +66,7 @@ ; 
AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -193,7 +193,7 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -215,7 +215,7 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -321,7 +321,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -411,7 +411,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -465,7 +465,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -528,7 +528,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -597,7 +597,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -675,7 +675,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0) + %1 = call i16 
@llvm.vector.reduce.and.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -700,7 +700,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -726,7 +726,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -756,7 +756,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -790,7 +790,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -862,7 +862,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -940,7 +940,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1027,32 +1027,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) +declare i16 
@llvm.vector.reduce.and.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -53,7 +53,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -112,7 +112,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -185,7 +185,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -268,7 +268,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -313,7 +313,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -367,7 +367,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -435,7 +435,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -513,7 +513,7 @@ ; 
AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -558,7 +558,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -612,7 +612,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -680,7 +680,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -801,7 +801,7 @@ ; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -853,7 +853,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -912,7 +912,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -982,7 +982,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1021,7 +1021,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1069,7 +1069,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 
} @@ -1124,7 +1124,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1189,7 +1189,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -1228,7 +1228,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1276,7 +1276,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1331,7 +1331,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1396,16 +1396,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 
; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -90,7 +90,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -176,7 +176,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -327,7 +327,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -367,7 +367,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float 0.0, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 } @@ -422,7 +422,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.0, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 } @@ -512,7 +512,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.0, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0) ret float %1 } @@ -667,7 +667,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.0, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0) ret float %1 } @@ -699,7 +699,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -746,7 +746,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -975,7 +975,7 @@ ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -1004,7 +1004,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1042,7 +1042,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1101,7 +1101,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1229,7 +1229,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1261,7 +1261,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double 0.0, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } @@ -1302,7 +1302,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.0, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0) ret double %1 } @@ -1364,7 +1364,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double 0.0, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0) ret double %1 } @@ -1467,7 +1467,7 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double 0.0, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0) ret double %1 } @@ -1493,7 +1493,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double undef, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } @@ -1528,7 +1528,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double undef, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0) ret double %1 } @@ -1584,7 +1584,7 @@ ; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double 
@llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double undef, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0) ret double %1 } @@ -1681,16 +1681,16 @@ ; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double undef, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fadd.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } @@ -78,7 +78,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 } @@ -125,7 +125,7 @@ ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -179,7 +179,7 @@ ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -206,7 +206,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } @@ -236,7 +236,7 @@ ; AVX512-NEXT: vmaxsd %xmm1, 
%xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0) ret double %1 } @@ -271,7 +271,7 @@ ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } @@ -313,16 +313,16 @@ ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -35,7 +35,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 } @@ -84,7 +84,7 @@ ; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 } @@ -155,7 +155,7 @@ ; AVX512-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -247,7 +247,7 @@ ; AVX512-NEXT: vmaxss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -274,7 +274,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0) ret 
double %1 } @@ -316,7 +316,7 @@ ; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v3f64(<3 x double> %a0) ret double %1 } @@ -350,7 +350,7 @@ ; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0) ret double %1 } @@ -395,7 +395,7 @@ ; AVX512-NEXT: vmaxsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } @@ -447,7 +447,7 @@ ; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } @@ -511,18 +511,18 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call nnan half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %a0) + %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0) ret half %1 } -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v3f64(<3 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v3f64(<3 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) -declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -14,7 +14,7 @@ ; ALL-LABEL: test_v1f32: ; ALL: # %bb.0: ; ALL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a0) ret float %1 } @@ -62,7 +62,7 @@ ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a0) ret float %1 } @@ -133,7 +133,7 @@ ; AVX512-NEXT: vmaxss %xmm2, %xmm1, 
%xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a0) ret float %1 } @@ -230,7 +230,7 @@ ; AVX512-NEXT: vmaxss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a0) ret float %1 } @@ -401,7 +401,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) ret float %1 } @@ -661,7 +661,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a0) ret float %1 } @@ -700,7 +700,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a0) ret double %1 } @@ -774,7 +774,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0) ret double %1 } @@ -922,7 +922,7 @@ ; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a0) ret double %1 } @@ -1091,18 +1091,18 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmax.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v3f32(<3 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmax.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmax.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmax.v3f32(<3 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmax.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmax.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmax.v16f64(<16 x 
double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -14,7 +14,7 @@ ; ALL-LABEL: test_v1f32: ; ALL: # %bb.0: ; ALL-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a0) ret float %1 } @@ -43,7 +43,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } @@ -84,7 +84,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v3f32(<3 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a0) ret float %1 } @@ -133,7 +133,7 @@ ; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a0) ret float %1 } @@ -204,7 +204,7 @@ ; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -296,7 +296,7 @@ ; AVX512-NEXT: vminss %xmm8, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) + %1 = call nnan float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0) ret float %1 } @@ -323,7 +323,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } @@ -357,7 +357,7 @@ ; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0) ret double %1 } @@ -402,7 +402,7 @@ ; AVX512-NEXT: vminsd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } @@ -454,7 +454,7 @@ ; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call nnan double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) + %1 = call nnan double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a0) ret double %1 } @@ -518,20 +518,20 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call nnan half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %a0) + %1 = call nnan half @llvm.vector.reduce.fmin.v2f16(<2 x half> %a0) ret half %1 } -declare float @llvm.experimental.vector.reduce.fmin.v1f32(<1 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float 
@llvm.experimental.vector.reduce.fmin.v3f32(<3 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v1f32(<1 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v3f32(<3 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) -declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>) +declare half @llvm.vector.reduce.fmin.v2f16(<2 x half>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -54,7 +54,7 @@ ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovaps %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a0) ret float %1 } @@ -151,7 +151,7 @@ ; AVX512-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a0) ret float %1 } @@ -322,7 +322,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) ret float %1 } @@ -582,7 +582,7 @@ ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a0) ret float %1 } @@ -621,7 +621,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vmovapd %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a0) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v3f64(<3 x double> %a0) ret double %1 } @@ -765,7 +765,7 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0) ret double %1 } @@ -913,7 +913,7 @@ ; AVX512VL-NEXT: vmovsd %xmm8, %xmm0, %xmm0 
{%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a0) ret double %1 } @@ -1082,17 +1082,17 @@ ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>) -declare float @llvm.experimental.vector.reduce.fmin.v16f32(<16 x float>) +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) -declare double @llvm.experimental.vector.reduce.fmin.v2f64(<2 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v3f64(<3 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v4f64(<4 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v8f64(<8 x double>) -declare double @llvm.experimental.vector.reduce.fmin.v16f64(<16 x double>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v3f64(<3 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -39,7 +39,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -82,7 +82,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -133,7 +133,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -191,7 +191,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -225,7 +225,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } 
@@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -313,7 +313,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -368,7 +368,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -402,7 +402,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -442,7 +442,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -490,7 +490,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -545,7 +545,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -575,7 +575,7 @@ ; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -608,7 +608,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -646,7 +646,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -691,7 +691,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) ret 
double %1 } @@ -719,7 +719,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -750,7 +750,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -786,7 +786,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -828,7 +828,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } @@ -856,7 +856,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -887,7 +887,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -923,7 +923,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -965,16 +965,16 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call fast double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>) -declare double 
@llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -38,7 +38,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float %a0, <2 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 } @@ -89,7 +89,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %a0, <4 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float %a0, <8 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1) ret float %1 } @@ -326,7 +326,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float %a0, <16 x float> %a1) + %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1) ret float %1 } @@ -360,7 +360,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float 1.0, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0) ret float %1 } @@ -407,7 +407,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.0, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) ret float %1 } @@ -489,7 +489,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float 1.0, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0) ret float %1 } @@ -636,7 +636,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float 1.0, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0) ret float %1 } @@ -668,7 +668,7 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float undef, <2 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0) ret float %1 } @@ -715,7 +715,7 @@ ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, 
%xmm0 ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float undef, <4 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) ret float %1 } @@ -797,7 +797,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float undef, <8 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0) ret float %1 } @@ -944,7 +944,7 @@ ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float undef, <16 x float> %a0) + %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0) ret float %1 } @@ -973,7 +973,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double %a0, <2 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } @@ -1011,7 +1011,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double %a0, <4 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1) ret double %1 } @@ -1070,7 +1070,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double %a0, <8 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1) ret double %1 } @@ -1198,7 +1198,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double %a0, <16 x double> %a1) + %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1) ret double %1 } @@ -1226,7 +1226,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double 1.0, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0) ret double %1 } @@ -1263,7 +1263,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double 1.0, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0) ret double %1 } @@ -1321,7 +1321,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double 1.0, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0) ret double %1 } @@ -1419,7 +1419,7 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double 1.0, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0) ret double %1 } @@ -1445,7 +1445,7 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq - %1 = call double 
@llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double undef, <2 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } @@ -1480,7 +1480,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double undef, <4 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0) ret double %1 } @@ -1536,7 +1536,7 @@ ; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double undef, <8 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0) ret double %1 } @@ -1633,16 +1633,16 @@ ; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double undef, <16 x double> %a0) + %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0) ret double %1 } -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>) -declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v16f32(float, <16 x float>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v2f64(double, <2 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v4f64(double, <4 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v8f64(double, <8 x double>) -declare double @llvm.experimental.vector.reduce.v2.fmul.f64.v16f64(double, <16 x double>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>) +declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -85,7 +85,7 @@ ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -231,7 +231,7 @@ ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -443,7 +443,7 @@ ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -763,7 +763,7 @@ ; AVX512DQVL-NEXT: vmovq %xmm0, %rax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i64 
@llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -799,7 +799,7 @@ ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -841,7 +841,7 @@ ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -905,7 +905,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -983,7 +983,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1086,7 +1086,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1119,7 +1119,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1154,7 +1154,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1195,7 +1195,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1258,7 +1258,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1380,7 +1380,7 @@ ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1522,7 +1522,7 @@ ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1555,7 +1555,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1607,7 +1607,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x 
i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1668,7 +1668,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1842,7 +1842,7 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -2051,7 +2051,7 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -2325,7 +2325,7 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2685,32 +2685,32 @@ ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) -declare i8 
@llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -57,7 +57,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) ret i1 %b } @@ -107,7 +107,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -169,7 +169,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -198,7 +198,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -253,7 +253,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -338,7 +338,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -407,7 +407,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -479,7 +479,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -580,7 +580,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i64> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -661,7 +661,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = trunc <16 x i32> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -748,7 +748,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -828,7 +828,7 @@ ; 
AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> - %b = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a) ret i1 %b } @@ -894,7 +894,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a) ret i1 %b } @@ -945,7 +945,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -998,7 +998,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <8 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -1046,7 +1046,7 @@ ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -1129,7 +1129,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b } @@ -1197,7 +1197,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -1264,7 +1264,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -1341,7 +1341,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -1441,7 +1441,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a) ret i1 %b } @@ -1499,7 +1499,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = icmp eq <16 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } @@ -1587,7 +1587,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a) ret i1 %b } @@ -1680,13 +1680,13 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a) ret i1 %b } -declare i1 @llvm.experimental.vector.reduce.or.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1>) -declare i1 
@llvm.experimental.vector.reduce.or.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -31,7 +31,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -60,7 +60,7 @@ ; AVX-NEXT: setne %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -111,7 +111,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a0) %2 = icmp eq i64 %1, 0 ret i1 %2 } @@ -175,7 +175,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 } @@ -198,7 +198,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -224,7 +224,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -253,7 +253,7 @@ ; AVX-NEXT: sete %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -304,7 +304,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a0) %2 = icmp ne i32 %1, 0 ret i1 %2 } @@ -368,7 +368,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 } @@ -391,7 +391,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -410,7 +410,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 
= call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -465,7 +465,7 @@ ; AVX-NEXT: setne %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -516,7 +516,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %a0) %2 = icmp eq i16 %1, 0 ret i1 %2 } @@ -580,7 +580,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 } @@ -603,7 +603,7 @@ ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -622,7 +622,7 @@ ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -641,7 +641,7 @@ ; AVX-NEXT: testq %rax, %rax ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -667,7 +667,7 @@ ; AVX-NEXT: vptest %xmm0, %xmm0 ; AVX-NEXT: setne %al ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -696,7 +696,7 @@ ; AVX-NEXT: sete %al ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -747,7 +747,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %a0) %2 = icmp ne i8 %1, 0 ret i1 %2 } @@ -811,7 +811,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 } @@ -841,7 +841,7 @@ ; AVX-NEXT: vptest {{.*}}(%rip), %xmm0 ; AVX-NEXT: sete %al ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) %2 = trunc i64 %1 to i16 %3 = icmp eq i16 %2, 0 ret i1 %3 @@ -888,7 +888,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = and i32 %1, 2147483648 %3 = icmp eq i32 %2, 0 ret i1 %3 @@ -935,7 +935,7 @@ ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = trunc i16 %1 to i8 %3 = icmp ne i8 %2, 0 ret i1 
%3 @@ -1003,7 +1003,7 @@ ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0) %2 = and i8 %1, 1 %3 = icmp eq i8 %2, 0 ret i1 %3 @@ -1037,34 +1037,34 @@ ; AVX-NEXT: retq %2 = bitcast %struct.Box* %0 to <4 x i32>* %3 = load <4 x i32>, <4 x i32>* %2, align 4 - %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %3) + %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %3) %5 = and i32 %4, 15 %6 = icmp eq i32 %5, 0 ret i1 %6 } -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -24,7 +24,7 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq - %1 = call i64 
@llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -66,7 +66,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -193,7 +193,7 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -215,7 +215,7 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -321,7 +321,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -411,7 +411,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -436,7 +436,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -465,7 +465,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -528,7 +528,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -597,7 +597,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -675,7 +675,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -700,7 +700,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -726,7 +726,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -756,7 +756,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -790,7 +790,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -862,7 +862,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -940,7 +940,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1027,32 +1027,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) +declare i16 
@llvm.vector.reduce.or.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -83,7 +83,7 @@ ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -209,7 +209,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -404,7 +404,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -731,7 +731,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -771,7 +771,7 @@ ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -819,7 +819,7 @@ ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -891,7 +891,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -981,7 +981,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1104,7 +1104,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %a0) + %1 = call i32 
@llvm.vector.reduce.smax.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1137,7 +1137,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1172,7 +1172,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1216,7 +1216,7 @@ ; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1280,7 +1280,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1354,7 +1354,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1445,7 +1445,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1491,7 +1491,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1547,7 +1547,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1615,7 +1615,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1685,7 +1685,7 @@ ; AVX512-NEXT: xorb $127, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1781,7 +1781,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1895,7 +1895,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2042,32 +2042,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 
@llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -83,7 +83,7 @@ ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -209,7 +209,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> 
%a0) ret i64 %1 } @@ -404,7 +404,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -731,7 +731,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -771,7 +771,7 @@ ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -819,7 +819,7 @@ ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -891,7 +891,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -981,7 +981,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1104,7 +1104,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1137,7 +1137,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1172,7 +1172,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1216,7 +1216,7 @@ ; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1280,7 +1280,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1354,7 +1354,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1445,7 +1445,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1491,7 +1491,7 @@ ; AVX512-NEXT: vmovd %xmm0, 
%eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1547,7 +1547,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1615,7 +1615,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1685,7 +1685,7 @@ ; AVX512-NEXT: xorb $-128, %al ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1781,7 +1781,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1895,7 +1895,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2042,32 +2042,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) 
+declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -89,7 +89,7 @@ ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -231,7 +231,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -453,7 +453,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -832,7 +832,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -875,7 +875,7 @@ ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -929,7 +929,7 @@ ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -1010,7 +1010,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -1115,7 +1115,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1265,7 +1265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %a0) ret i32 %1 } @@ 
-1311,7 +1311,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1361,7 +1361,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1422,7 +1422,7 @@ ; AVX512VL-NEXT: notl %eax ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1505,7 +1505,7 @@ ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1602,7 +1602,7 @@ ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1721,7 +1721,7 @@ ; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1754,7 +1754,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1790,7 +1790,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1832,7 +1832,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1901,7 +1901,7 @@ ; AVX512VL-NEXT: notb %al ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1994,7 +1994,7 @@ ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -2099,7 +2099,7 @@ ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2222,32 +2222,32 @@ ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %a0) + %1 = call i8 
@llvm.vector.reduce.umax.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -89,7 +89,7 @@ ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -232,7 +232,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -456,7 +456,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; 
AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -836,7 +836,7 @@ ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -879,7 +879,7 @@ ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -933,7 +933,7 @@ ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -1014,7 +1014,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -1119,7 +1119,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -1269,7 +1269,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -1315,7 +1315,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -1365,7 +1365,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -1407,7 +1407,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -1467,7 +1467,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1539,7 +1539,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1632,7 +1632,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1665,7 +1665,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = 
call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -1701,7 +1701,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -1743,7 +1743,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -1791,7 +1791,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -1859,7 +1859,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1937,7 +1937,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2032,32 +2032,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) +declare i16 
@llvm.vector.reduce.umin.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -57,7 +57,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a) ret i1 %b } @@ -107,7 +107,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -172,7 +172,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -201,7 +201,7 @@ ; AVX512-NEXT: setnp %al ; AVX512-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -256,7 +256,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -345,7 +345,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i32> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -423,7 +423,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i16> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -504,7 +504,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -609,7 +609,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <8 x i64> %0 to <8 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -717,7 +717,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <16 x i32> %0 to <16 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b 
= call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -815,7 +815,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -912,7 +912,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> - %b = call i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> %a) ret i1 %b } @@ -978,7 +978,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a) ret i1 %b } @@ -1029,7 +1029,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -1082,7 +1082,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <8 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -1136,7 +1136,7 @@ ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -1219,7 +1219,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a) ret i1 %b } @@ -1288,7 +1288,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -1366,7 +1366,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -1454,7 +1454,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i8> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -1557,7 +1557,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <8 x i64> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a) ret i1 %b } @@ -1643,7 +1643,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <16 x i32> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a) ret i1 %b } @@ -1743,7 +1743,7 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer - %b = call i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a) ret i1 %b } @@ -1853,13 +1853,13 @@ ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer - %b = call i1 
@llvm.experimental.vector.reduce.xor.v64i1(<64 x i1> %a) + %b = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> %a) ret i1 %b } -declare i1 @llvm.experimental.vector.reduce.xor.v2i1(<2 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v4i1(<4 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v8i1(<8 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v16i1(<16 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v32i1(<32 x i1>) -declare i1 @llvm.experimental.vector.reduce.xor.v64i1(<64 x i1>) +declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>) +declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>) +declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>) +declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>) +declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>) +declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>) diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -24,7 +24,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a0) ret i64 %1 } @@ -66,7 +66,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %a0) ret i64 %1 } @@ -114,7 +114,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %a0) ret i64 %1 } @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> %a0) + %1 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %a0) ret i64 %1 } @@ -193,7 +193,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a0) ret i32 %1 } @@ -215,7 +215,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a0) ret i32 %1 } @@ -265,7 +265,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a0) ret i32 %1 } @@ -321,7 +321,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %a0) ret i32 %1 } @@ -386,7 +386,7 @@ ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> %a0) + %1 = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %a0) ret i32 %1 } @@ -411,7 +411,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a0) ret i16 %1 } @@ -436,7 
+436,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a0) ret i16 %1 } @@ -465,7 +465,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a0) ret i16 %1 } @@ -528,7 +528,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -597,7 +597,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -675,7 +675,7 @@ ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> %a0) + %1 = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -700,7 +700,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %a0) ret i8 %1 } @@ -726,7 +726,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a0) ret i8 %1 } @@ -756,7 +756,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a0) ret i8 %1 } @@ -790,7 +790,7 @@ ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a0) ret i8 %1 } @@ -862,7 +862,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -940,7 +940,7 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -1027,32 +1027,32 @@ ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> %a0) + %1 = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %a0) ret i8 %1 } -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>) +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x 
i64>) +declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>) +declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>) -declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>) +declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>) -declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>) +declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>) diff --git a/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll b/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll --- a/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/experimental-reduce.ll @@ -5,9 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare i32 @llvm.experimental.vector.reduce.add(<3 x i32>) -declare i32 @llvm.experimental.vector.reduce.and(<3 x i32>) -declare i32 @llvm.experimental.vector.reduce.or(<3 x i32>) +declare i32 @llvm.vector.reduce.add(<3 x i32>) +declare i32 @llvm.vector.reduce.and(<3 x i32>) +declare i32 @llvm.vector.reduce.or(<3 x i32>) ; CHECK-LABEL: @reduce_add define i32 @reduce_add() sanitize_memory { @@ -17,9 +17,9 @@ %o = load <3 x i32>, <3 x i32> *%p ; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, <3 x i32>* ; CHECK: [[O_ORIGIN:%.*]] = load i32, i32* -; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]]) -; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v3i32(<3 x i32> [[O]]) - %r = 
call i32 @llvm.experimental.vector.reduce.add(<3 x i32> %o) +; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]]) +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[O]]) + %r = call i32 @llvm.vector.reduce.add(<3 x i32> %o) ; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls ; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls ; CHECK: ret i32 [[R]] @@ -35,11 +35,11 @@ ; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, <3 x i32>* ; CHECK: [[O_ORIGIN:%.*]] = load i32, i32* ; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[O]], [[O_SHADOW]] -; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]] -; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]]) +; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]] +; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]]) ; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]] -; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O]]) - %r = call i32 @llvm.experimental.vector.reduce.and(<3 x i32> %o) +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O]]) + %r = call i32 @llvm.vector.reduce.and(<3 x i32> %o) ; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls ; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls ; CHECK: ret i32 [[R]] @@ -56,11 +56,11 @@ ; CHECK: [[O_ORIGIN:%.*]] = load i32, i32* ; CHECK: [[NOT_O:%.*]] = xor <3 x i32> [[O]], ; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[NOT_O]], [[O_SHADOW]] -; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]] -; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]]) +; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]] +; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]]) ; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]] -; CHECK: [[R:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v3i32(<3 x i32> [[O]]) - %r = call i32 @llvm.experimental.vector.reduce.or(<3 x i32> %o) +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O]]) + %r = call i32 @llvm.vector.reduce.or(<3 x i32> %o) ; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls ; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls ; CHECK: ret i32 [[R]] diff --git a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll --- a/llvm/test/Transforms/InstCombine/vector-reductions.ll +++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll @@ -1,23 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>) -declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) declare void @use_f32(float) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 
x i32>) declare void @use_i32(i32) define float @diff_of_sums_v4f32(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc nsz <4 x float> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[TMP1]]) ; CHECK-NEXT: [[R:%.*]] = fsub reassoc nsz float [[TMP2]], [[A1:%.*]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r0 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) %r = fsub reassoc nsz float %r0, %r1 ret float %r } @@ -26,13 +26,13 @@ define float @diff_of_sums_v4f32_fmf(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_v4f32_fmf( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = fsub nnan ninf nsz float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) %r = fsub ninf nnan nsz float %r0, %r1 ret float %r } @@ -41,15 +41,15 @@ define float @diff_of_sums_extra_use1(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_extra_use1( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) ; CHECK-NEXT: call void @use_f32(float [[R0]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) call void @use_f32(float %r0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) %r = fsub fast float %r0, %r1 ret float %r } @@ -58,14 +58,14 @@ define float 
@diff_of_sums_extra_use2(float %a0, <4 x float> %v0, float %a1, <4 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_extra_use2( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A1:%.*]], <4 x float> [[V1:%.*]]) ; CHECK-NEXT: call void @use_f32(float [[R1]]) ; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a1, <4 x float> %v1) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a1, <4 x float> %v1) call void @use_f32(float %r1) %r = fsub fast float %r0, %r1 ret float %r @@ -75,13 +75,13 @@ define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1, <8 x float> %v1) { ; CHECK-LABEL: @diff_of_sums_type_mismatch( -; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float [[A0:%.*]], <4 x float> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float [[A1:%.*]], <8 x float> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = fsub fast float [[R0]], [[R1]] ; CHECK-NEXT: ret float [[R]] ; - %r0 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %a0, <4 x float> %v0) - %r1 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float %a1, <8 x float> %v1) + %r0 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %v0) + %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a1, <8 x float> %v1) %r = fsub fast float %r0, %r1 ret float %r } @@ -89,11 +89,11 @@ define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) %r = sub i32 %r0, %r1 ret i32 %r } @@ -102,15 +102,15 @@ define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use1( -; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) ; CHECK-NEXT: call void @use_i32(i32 [[R0]]) -; CHECK-NEXT: [[R1:%.*]] = call 
i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]] ; CHECK-NEXT: ret i32 [[R]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0) + %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0) call void @use_i32(i32 %r0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) %r = sub i32 %r0, %r1 ret i32 %r } @@ -119,14 +119,14 @@ define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use2( -; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) ; CHECK-NEXT: call void @use_i32(i32 [[R1]]) ; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]] ; CHECK-NEXT: ret i32 [[R]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) call void @use_i32(i32 %r1) %r = sub i32 %r0, %r1 ret i32 %r @@ -136,13 +136,13 @@ define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @diff_of_sums_type_mismatch2( -; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]]) -; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]]) ; CHECK-NEXT: [[R:%.*]] = sub i32 [[R0]], [[R1]] ; CHECK-NEXT: ret i32 [[R]] ; - %r0 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %v0) - %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1) + %r0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v0) + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1) %r = sub i32 %r0, %r1 ret i32 %r } diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll --- a/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vecreduce.ll @@ -1,31 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instsimplify -S | FileCheck %s -declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x 
i32> %a) -declare i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a) +declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a) define i32 @add_0() { ; CHECK-LABEL: @add_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -33,7 +33,7 @@ ; CHECK-LABEL: @add_1( ; CHECK-NEXT: ret i32 8 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ret i32 %x } @@ -41,7 +41,7 @@ ; CHECK-LABEL: @add_inc( ; CHECK-NEXT: ret i32 18 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ret i32 %x } @@ -49,25 +49,25 @@ ; CHECK-LABEL: @add_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> ) ret i32 %x } define i32 @add_undef() { ; CHECK-LABEL: @add_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ret i32 %x } define i32 @add_undef1() { ; CHECK-LABEL: @add_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> ) ret i32 %x } @@ -77,7 +77,7 @@ ; CHECK-LABEL: @mul_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -85,7 +85,7 @@ ; 
CHECK-LABEL: @mul_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ret i32 %x } @@ -93,7 +93,7 @@ ; CHECK-LABEL: @mul_inc( ; CHECK-NEXT: ret i32 40320 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ret i32 %x } @@ -101,25 +101,25 @@ ; CHECK-LABEL: @mul_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> ) ret i32 %x } define i32 @mul_undef() { ; CHECK-LABEL: @mul_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) ret i32 %x } define i32 @mul_undef1() { ; CHECK-LABEL: @mul_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> ) ret i32 %x } @@ -128,7 +128,7 @@ ; CHECK-LABEL: @and_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -136,7 +136,7 @@ ; CHECK-LABEL: @and_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ret i32 %x } @@ -144,7 +144,7 @@ ; CHECK-LABEL: @and_inc( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ret i32 %x } @@ -152,25 +152,25 @@ ; CHECK-LABEL: @and_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> ) ret i32 %x } define i32 @and_undef() { ; CHECK-LABEL: @and_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) ret i32 %x } define i32 @and_undef1() { ; CHECK-LABEL: @and_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> ) ret i32 %x } @@ -179,7 +179,7 @@ ; CHECK-LABEL: @or_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -187,7 +187,7 @@ ; CHECK-LABEL: @or_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) 
ret i32 %x } @@ -195,7 +195,7 @@ ; CHECK-LABEL: @or_inc( ; CHECK-NEXT: ret i32 -1 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ret i32 %x } @@ -203,25 +203,25 @@ ; CHECK-LABEL: @or_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> ) ret i32 %x } define i32 @or_undef() { ; CHECK-LABEL: @or_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) ret i32 %x } define i32 @or_undef1() { ; CHECK-LABEL: @or_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> ) ret i32 %x } @@ -230,7 +230,7 @@ ; CHECK-LABEL: @xor_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -238,7 +238,7 @@ ; CHECK-LABEL: @xor_1( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ret i32 %x } @@ -246,7 +246,7 @@ ; CHECK-LABEL: @xor_inc( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ret i32 %x } @@ -254,25 +254,25 @@ ; CHECK-LABEL: @xor_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> ) ret i32 %x } define i32 @xor_undef() { ; CHECK-LABEL: @xor_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) ret i32 %x } define i32 @xor_undef1() { ; CHECK-LABEL: @xor_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> ) ret i32 %x } @@ -281,7 +281,7 @@ ; CHECK-LABEL: @smin_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -289,7 +289,7 @@ ; CHECK-LABEL: @smin_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ret i32 %x } @@ -297,7 +297,7 @@ ; CHECK-LABEL: @smin_inc( ; CHECK-NEXT: ret i32 -6 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) + %x = call i32 
@llvm.vector.reduce.smin.v8i32(<8 x i32> ) ret i32 %x } @@ -305,25 +305,25 @@ ; CHECK-LABEL: @smin_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> ) ret i32 %x } define i32 @smin_undef() { ; CHECK-LABEL: @smin_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ret i32 %x } define i32 @smin_undef1() { ; CHECK-LABEL: @smin_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> ) ret i32 %x } @@ -332,7 +332,7 @@ ; CHECK-LABEL: @smax_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -340,7 +340,7 @@ ; CHECK-LABEL: @smax_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ret i32 %x } @@ -348,7 +348,7 @@ ; CHECK-LABEL: @smax_inc( ; CHECK-NEXT: ret i32 8 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ret i32 %x } @@ -356,25 +356,25 @@ ; CHECK-LABEL: @smax_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> ) ret i32 %x } define i32 @smax_undef() { ; CHECK-LABEL: @smax_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ret i32 %x } define i32 @smax_undef1() { ; CHECK-LABEL: @smax_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> ) ret i32 %x } @@ -383,7 +383,7 @@ ; CHECK-LABEL: @umin_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -391,7 +391,7 @@ ; CHECK-LABEL: @umin_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ret i32 %x } @@ -399,7 +399,7 @@ ; CHECK-LABEL: @umin_inc( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ret i32 %x } @@ -407,25 +407,25 @@ ; CHECK-LABEL: @umin_1v( ; CHECK-NEXT: ret i32 
10 ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> ) ret i32 %x } define i32 @umin_undef() { ; CHECK-LABEL: @umin_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ret i32 %x } define i32 @umin_undef1() { ; CHECK-LABEL: @umin_undef1( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> ) ret i32 %x } @@ -434,7 +434,7 @@ ; CHECK-LABEL: @umax_0( ; CHECK-NEXT: ret i32 0 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer) ret i32 %x } @@ -442,7 +442,7 @@ ; CHECK-LABEL: @umax_1( ; CHECK-NEXT: ret i32 1 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ret i32 %x } @@ -450,7 +450,7 @@ ; CHECK-LABEL: @umax_inc( ; CHECK-NEXT: ret i32 -3 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ret i32 %x } @@ -458,24 +458,24 @@ ; CHECK-LABEL: @umax_1v( ; CHECK-NEXT: ret i32 10 ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> ) ret i32 %x } define i32 @umax_undef() { ; CHECK-LABEL: @umax_undef( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> undef) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ret i32 %x } define i32 @umax_undef1d() { ; CHECK-LABEL: @umax_undef1d( -; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) +; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ; CHECK-NEXT: ret i32 [[X]] ; - %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> ) + %x = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> ) ret i32 %x } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -8,7 +8,7 @@ ; Function Attrs: norecurse nounwind readonly define i32 @fn1() local_unnamed_addr #0 { ; Ensure that we don't emit reduction intrinsics for unsupported short reductions. 
-; CHECK-NOT: @llvm.experimental.vector.reduce
+; CHECK-NOT: @llvm.vector.reduce
entry:
%0 = load i32, i32* @b, align 4, !tbaa !1
%cmp40 = icmp sgt i32 %0, 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -20,7 +20,7 @@
; CHECK: add <16 x i8>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8>
; CHECK: zext i8 [[Rdx]] to i32
;
define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
@@ -75,7 +75,7 @@
; CHECK: add <8 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
@@ -132,7 +132,7 @@
; CHECK: add <8 x i16>
;
; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
; CHECK: zext i16 [[Rdx]] to i32
;
define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -628,7 +628,7 @@
ret void
}
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -23,7 +23,7 @@
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -77,7 +77,7 @@
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]],
label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -132,7 +132,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -186,7 +186,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -240,7 +240,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -294,7 +294,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -348,7 +348,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -402,7 +402,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -451,7 +451,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -508,7 +508,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -41,7 +41,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -129,7 +129,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP11]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -210,7 +210,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -281,7 +281,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 
@llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -352,7 +352,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -423,7 +423,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -494,7 +494,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -565,7 +565,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -636,7 +636,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -703,7 +703,7 @@ ; 
CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -773,7 +773,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -843,7 +843,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -913,7 +913,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -183,7 +183,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] @@ -232,7 +232,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br 
label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -282,7 +282,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -330,7 +330,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] @@ -379,7 +379,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] @@ -427,7 +427,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] @@ -664,7 +664,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] @@ -720,7 +720,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], 
[ [[TMP10]], [[MIDDLE_BLOCK]] ] @@ -778,7 +778,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] @@ -834,7 +834,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] @@ -890,7 +890,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP8]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] @@ -946,7 +946,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP6]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-fold-multiple-icmps.ll @@ -29,8 +29,8 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll --- 
a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -595,7 +595,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -665,7 +665,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -735,7 +735,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -805,7 +805,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -110,7 +110,7 @@ ; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; AVX: middle.block: -; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[PREDPHI]]) +; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[PREDPHI]]) ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- 
a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -59,7 +59,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <16 x i32> [[TMP13]], [[BIN_RDX11]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -97,7 +97,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] ; CHECK-NEXT: [[BIN_RDX19:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX19]] -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -263,7 +263,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -450,7 +450,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]] ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]] -; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -776,7 +776,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -953,7 +953,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> 
[[TMP84]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1126,7 +1126,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1477,7 +1477,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]] ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1644,7 +1644,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1811,7 +1811,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1978,7 +1978,7 @@ ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -90,7 
+90,7 @@ ; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP27]], [[TMP26]] -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP7]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll @@ -27,7 +27,7 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]]) ; CHECK-NEXT: ret i8 [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -76,7 +76,7 @@ ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -148,7 +148,7 @@ ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -220,7 +220,7 @@ ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll 
b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -62,7 +62,7 @@ ; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 ; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -195,7 +195,7 @@ ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP15]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -27,13 +27,13 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD4]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD5]]) ; CHECK-NEXT: [[TMP13]] = add i32 [[TMP12]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD6]]) ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -15,7 +15,7 @@ ; CHECK-NEXT: 
[[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -65,11 +65,11 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -121,7 +121,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], 12 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 @@ -173,11 +173,11 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP9]] = mul i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -234,9 +234,9 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP8]] = add i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -291,9 +291,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = mul i32 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -346,7 +346,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -398,9 +398,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = and i32 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -453,7 +453,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> 
[[TMP4]]) ; CHECK-NEXT: [[TMP6]] = or i32 [[TMP5]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -506,7 +506,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = xor i32 [[TMP5]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -558,9 +558,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = fadd float [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -612,9 +612,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP7]] = fmul float [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -663,7 +663,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 @@ -711,7 +711,7 
+711,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt i32 [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 @@ -765,7 +765,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -827,7 +827,7 @@ ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]]) +; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -950,11 +950,11 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]]) ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], @@ -1012,7 +1012,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !36 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -1059,7 +1059,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* ; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = and i32 [[TMP5]], [[TMP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -10,7 +10,7 @@ ; CHECK: [[TMP24:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP23:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]] ; CHECK: middle.block: -; CHECK: [[TMP27:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) +; CHECK: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; entry: br label %.lr.ph @@ -38,7 +38,7 @@ ; CHECK: [[TMP46:%.*]] = add <4 x i32> [[TMP45]], [[TMP43:%.*]] ; CHECK: [[TMP47]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP49:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP47]]) +; CHECK: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP47]]) ; entry: br label %.lr.ph @@ -70,7 +70,7 @@ ; CHECK: [[TMP45:%.*]] = mul <4 x i32> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %.lr.ph @@ -101,7 +101,7 @@ ; CHECK: [[TMP45:%.*]] = and <4 x i32> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %for.body @@ -131,7 +131,7 @@ ; CHECK: [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP46]] = or <4 x i32> [[VEC_PHI]], [[TMP45]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %for.body @@ -161,7 +161,7 @@ ; CHECK: [[TMP45:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP44:%.*]], <4 x i32> zeroinitializer ; CHECK: [[TMP46]] = xor <4 x i32> [[VEC_PHI]], [[TMP45]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP46]]) ; entry: br label %for.body @@ -192,7 +192,7 @@ ; CHECK: [[TMP45:%.*]] = fadd fast <4 x float> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 
0.000000e+00, <4 x float> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP46]]) ; entry: br label %for.body @@ -223,7 +223,7 @@ ; CHECK: [[TMP45:%.*]] = fmul fast <4 x float> [[TMP44]], [[TMP43:%.*]] ; CHECK: [[TMP46]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP46]]) +; CHECK: [[TMP48:%.*]] = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP46]]) ; entry: br label %for.body @@ -254,7 +254,7 @@ ; CHECK: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]] ; CHECK: [[TMP26]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP26]]) +; CHECK: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP26]]) ; entry: br label %for.body @@ -283,7 +283,7 @@ ; CHECK: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]] ; CHECK: [[TMP26]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]] ; CHECK: middle.block: -; CHECK: [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP26]]) +; CHECK: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP26]]) ; entry: br label %for.body diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -8,7 +8,7 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @ext_ext_or_reduction_v4i32( ; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[Z]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[Z]]) ; CHECK-NEXT: ret i32 [[TMP1]] ; %z = and <4 x i32> %x, %y @@ -74,7 +74,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) ; CHECK-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP8]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5]] to i32 ; CHECK-NEXT: ret i32 [[COND6]] @@ -133,7 +133,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP3:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] @@ -181,7 +181,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; 
CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND5:%.*]] = zext i1 [[CMP4]] to i32 ; CHECK-NEXT: ret i32 [[COND5]] @@ -240,7 +240,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]] ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32 ; CHECK-NEXT: ret i32 [[COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -23,7 +23,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: ret i32 [[TMP10]] ; %tmp00 = lshr i32 %a, 15 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -17,7 +17,7 @@ ; DEFAULT: for.body: ; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) ; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] ; DEFAULT-NEXT: br label [[FOR_BODY]] ; @@ -61,7 +61,7 @@ ; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 ; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 ; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) ; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]] ; GATHER-NEXT: br label [[FOR_BODY]] ; @@ -153,7 +153,7 @@ ; DEFAULT: for.body: ; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 
@llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) ; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 ; DEFAULT-NEXT: br label [[FOR_BODY]] ; @@ -197,7 +197,7 @@ ; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 ; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 ; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]]) ; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5 ; GATHER-NEXT: br label [[FOR_BODY]] ; @@ -229,7 +229,7 @@ ; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] ; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] ; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -46,7 +46,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] @@ -169,7 +169,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP5]], [[S_020]] ; CHECK-NEXT: [[CMP14:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]] ; CHECK-NEXT: br i1 [[CMP14]], label [[IF_END]], label [[FOR_END_LOOPEXIT:%.*]] @@ -285,7 +285,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP10]], [[S_047]] ; CHECK-NEXT: [[CMP83:%.*]] = icmp slt i32 
[[OP_EXTRA]], [[LIM:%.*]] ; CHECK-NEXT: br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -244,7 +244,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) ; CHECK-NEXT: ret i32 [[TMP15]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP10]], 1 ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]] ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> , [[TMP5]] ; CHECK-NEXT: [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP9]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0 ; CHECK-NEXT: [[OP_EXTRA1]] = add i64 [[OP_EXTRA]], [[TMP6]] ; CHECK-NEXT: br label [[LOOP]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0:%.*]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = and i32 [[OP_EXTRA1]], [[TMP0]] @@ -62,7 +62,7 @@ ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 ; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 
@llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) ; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] ; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] ; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -84,7 +84,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] @@ -111,7 +111,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], float -1.000000e+00, float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] @@ -138,7 +138,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP4]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] @@ -169,7 +169,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 ; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> 
[[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP5]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -102,7 +102,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; CHECK-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 @@ -118,7 +118,7 @@ ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] ; THRESHOLD-NEXT: store float [[OP_EXTRA1]], float* @res, align 4 @@ -175,7 +175,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: store float [[TMP5]], float* @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] @@ -187,7 +187,7 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: store float [[TMP5]], float* @res, align 4 ; THRESHOLD-NEXT: ret float [[TMP5]] @@ -223,7 +223,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: 
[[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4 @@ -236,7 +236,7 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], i32* @n, align 4 @@ -390,8 +390,8 @@ ; CHECK-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; CHECK-NEXT: ret float [[OP_RDX]] ; @@ -448,8 +448,8 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47 ; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4 -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; @@ -637,7 +637,7 @@ ; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast 
float @llvm.vector.reduce.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA]] ; @@ -678,7 +678,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v32f32(float 0.000000e+00, <32 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA]] ; @@ -824,10 +824,10 @@ ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] @@ -873,10 +873,10 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) -; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) +; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP7]]) +; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP5]]) ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX1]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] 
@@ -990,7 +990,7 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA1]] @@ -1009,7 +1009,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] @@ -1060,7 +1060,7 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 ; CHECK-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00 @@ -1081,7 +1081,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00 ; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], 5.000000e+00 @@ -1138,7 +1138,7 @@ ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float 
[[OP_EXTRA]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA1]] @@ -1159,7 +1159,7 @@ ; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA1]] @@ -1212,7 +1212,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; CHECK-NEXT: ret i32 [[OP_EXTRA1]] @@ -1231,7 +1231,7 @@ ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 ; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer ; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> -; THRESHOLD-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) +; THRESHOLD-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]] ; THRESHOLD-NEXT: ret i32 [[OP_EXTRA1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -12,7 +12,7 @@ define i32 @maxi8(i32) { ; CHECK-LABEL: @maxi8( ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -43,7 +43,7 @@ define i32 @maxi16(i32) { ; CHECK-LABEL: @maxi16( ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -98,7 +98,7 @@ define i32 @maxi32(i32) { ; CHECK-LABEL: @maxi32( ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> 
[[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> [[TMP2]]) ; CHECK-NEXT: ret i32 [[TMP3]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 @@ -758,7 +758,7 @@ ; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) ; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]] ; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] ; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]] @@ -776,7 +776,7 @@ ; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; THRESH-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; THRESH-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0 ; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1 ; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0 @@ -860,7 +860,7 @@ ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) ; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] ; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] ; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] @@ -879,7 +879,7 @@ ; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 ; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 ; THRESH-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) ; THRESH-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] ; THRESH-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] ; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -37,7 +37,7 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_033]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]] @@ -70,7 +70,7 @@ ; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* ; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; STORE-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], -; STORE-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; STORE-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; STORE-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_033]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]] @@ -164,7 +164,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_040]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -202,7 +202,7 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; STORE-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_040]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -326,7 +326,7 @@ ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]] ; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4 ; CHECK-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]] ; CHECK-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_083]], 1 @@ -383,7 +383,7 @@ ; 
STORE-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]] ; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4 ; STORE-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]] -; STORE-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) +; STORE-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP6]]) ; STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]] ; STORE-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_083]], 1 @@ -520,7 +520,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]] ; CHECK-NEXT: [[INC]] = add nsw i64 [[I_043]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -558,7 +558,7 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; STORE-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]] ; STORE-NEXT: [[INC]] = add nsw i64 [[I_043]], 1 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]] @@ -1015,7 +1015,7 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]] -; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; STORE-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) ; STORE-NEXT: store float [[TMP6]], float* [[C_ADDR_038]], align 4 ; STORE-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1 ; STORE-NEXT: [[INC]] = add nsw i64 [[I_039]], 1 @@ -1090,7 +1090,7 @@ ; STORE-LABEL: @float_red_example4( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1130,7 +1130,7 @@ ; STORE-LABEL: @float_red_example8( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call fast float 
@llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1194,7 +1194,7 @@ ; STORE-LABEL: @float_red_example16( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) ; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1250,7 +1250,7 @@ ; STORE-LABEL: @i32_red_example4( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1290,7 +1290,7 @@ ; STORE-LABEL: @i32_red_example8( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1354,7 +1354,7 @@ ; STORE-LABEL: @i32_red_example16( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1466,7 +1466,7 @@ ; STORE-LABEL: @i32_red_example32( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]]) ; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16 ; STORE-NEXT: ret void ; @@ -1544,14 +1544,14 @@ ; CHECK-LABEL: @i32_red_call( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: ret void ; ; STORE-LABEL: @i32_red_call( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> 
[[TMP0]]) ; STORE-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]]) ; STORE-NEXT: ret void ; @@ -1579,7 +1579,7 @@ ; CHECK-LABEL: @i32_red_invoke( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]]) ; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]] ; CHECK: exception: @@ -1592,7 +1592,7 @@ ; STORE-LABEL: @i32_red_invoke( ; STORE-NEXT: entry: ; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 -; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) +; STORE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]]) ; STORE-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]]) ; STORE-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]] ; STORE: exception: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @Foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> [[TMP0]]) ; CHECK-NEXT: ret i8 [[TMP1]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll @@ -80,7 +80,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[T4:%.*]] = load i32, i32* [[X4]], align 4 ; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[X5]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[T4]] ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[T4]] ; CHECK-NEXT: [[C012345:%.*]] = icmp sgt i32 [[TMP5]], [[T5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -35,7 +35,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) ; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: @@ -124,7 +124,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* 
[[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: @@ -230,7 +230,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[Q]] to <8 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[REORDER_SHUFFLE]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP5]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -26,7 +26,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -74,7 +74,7 @@ ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) +; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) ; AVX-NEXT: ret i32 [[TMP2]] ; ; SSE-LABEL: @test_mul( @@ -148,7 +148,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -196,7 +196,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -244,7 +244,7 @@ ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP1]]) ; 
CHECK-NEXT: ret i32 [[TMP2]] ; entry: @@ -284,7 +284,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]] ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) ; CHECK-NEXT: ret i32 [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -18,14 +18,14 @@ ; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef ; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 ; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef ; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef ; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @dotf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast float 
@llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[TMP1]] ; entry: @@ -33,7 +33,7 @@ ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32 ; CHECK-NEXT: [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) ; CHECK-NEXT: ret double [[TMP3]] ; entry: @@ -63,7 +63,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: ret float [[TMP3]] ; entry: @@ -93,7 +93,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.experimental.vector.reduce.v2.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) ; CHECK-NEXT: ret double [[TMP3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -37,7 +37,7 @@ ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll @@ -16,7 +16,7 @@ ; CHECK-NEXT: [[DOTSROA_RAW_IDX_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 
[[TMP2]], undef ; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 undef ; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[OP_EXTRA]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll @@ -55,7 +55,7 @@ ; CHECK-NEXT: [[TMP38:%.*]] = icmp slt <4 x i32> [[TMP37]], zeroinitializer ; CHECK-NEXT: [[TMP39:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP37]] ; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP38]], <4 x i32> [[TMP39]], <4 x i32> [[TMP37]] -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP40]]) ; CHECK-NEXT: [[TMP42:%.*]] = icmp slt i32 [[TMP41]], [[TMP32]] ; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP41]], i32 [[TMP32]] ; CHECK-NEXT: [[TMP44:%.*]] = icmp slt i32 [[TMP43]], [[B_0]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; entry: @@ -67,7 +67,7 @@ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; entry: @@ -121,7 +121,7 @@ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] ; entry: diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -291,14 +291,14 @@ // LLVM vector reduction over a single vector. class LLVM_VectorReduction - : LLVM_OneResultIntrOp<"experimental.vector.reduce." # mnem, + : LLVM_OneResultIntrOp<"vector.reduce." # mnem, [], [0], [NoSideEffect]>, Arguments<(ins LLVM_Type)>; // LLVM vector reduction over a single vector, with an initial value, // and with permission to reassociate the reduction operations. 
-class LLVM_VectorReductionV2<string mnem> -    : LLVM_OpBase<LLVM_Dialect, "intr.experimental.vector.reduce.v2." # mnem, +class LLVM_VectorReductionAcc<string mnem> +    : LLVM_OpBase<LLVM_Dialect, "intr.vector.reduce." # mnem, [NoSideEffect]>, Results<(outs LLVM_Type:$res)>, Arguments<(ins LLVM_Type, LLVM_Type, @@ -307,7 +307,7 @@ llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::Function *fn = llvm::Intrinsic::getDeclaration( module, - llvm::Intrinsic::experimental_vector_reduce_v2_}] # mnem # [{, + llvm::Intrinsic::vector_reduce_}] # mnem # [{, { }] # StrJoin.lst, ListIntSubst.lst)>.result # [{ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -889,20 +889,20 @@ // Vector Reductions. // -def LLVM_experimental_vector_reduce_add : LLVM_VectorReduction<"add">; -def LLVM_experimental_vector_reduce_and : LLVM_VectorReduction<"and">; -def LLVM_experimental_vector_reduce_mul : LLVM_VectorReduction<"mul">; -def LLVM_experimental_vector_reduce_fmax : LLVM_VectorReduction<"fmax">; -def LLVM_experimental_vector_reduce_fmin : LLVM_VectorReduction<"fmin">; -def LLVM_experimental_vector_reduce_or : LLVM_VectorReduction<"or">; -def LLVM_experimental_vector_reduce_smax : LLVM_VectorReduction<"smax">; -def LLVM_experimental_vector_reduce_smin : LLVM_VectorReduction<"smin">; -def LLVM_experimental_vector_reduce_umax : LLVM_VectorReduction<"umax">; -def LLVM_experimental_vector_reduce_umin : LLVM_VectorReduction<"umin">; -def LLVM_experimental_vector_reduce_xor : LLVM_VectorReduction<"xor">; - -def LLVM_experimental_vector_reduce_v2_fadd : LLVM_VectorReductionV2<"fadd">; -def LLVM_experimental_vector_reduce_v2_fmul : LLVM_VectorReductionV2<"fmul">; +def LLVM_vector_reduce_add : LLVM_VectorReduction<"add">; +def LLVM_vector_reduce_and : LLVM_VectorReduction<"and">; +def LLVM_vector_reduce_mul : LLVM_VectorReduction<"mul">; +def LLVM_vector_reduce_fmax : LLVM_VectorReduction<"fmax">; +def LLVM_vector_reduce_fmin : LLVM_VectorReduction<"fmin">; +def LLVM_vector_reduce_or : LLVM_VectorReduction<"or">; +def LLVM_vector_reduce_smax : LLVM_VectorReduction<"smax">; +def LLVM_vector_reduce_smin : LLVM_VectorReduction<"smin">; +def LLVM_vector_reduce_umax : LLVM_VectorReduction<"umax">; +def LLVM_vector_reduce_umin : LLVM_VectorReduction<"umin">; +def LLVM_vector_reduce_xor : LLVM_VectorReduction<"xor">; + +def LLVM_vector_reduce_fadd : LLVM_VectorReductionAcc<"fadd">; +def LLVM_vector_reduce_fmul : LLVM_VectorReductionAcc<"fmul">; // // LLVM Matrix operations.
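Each LLVM_VectorReduction and LLVM_VectorReductionAcc def above generates an LLVM-dialect op whose name drops the experimental/v2 segments and which lowers to the correspondingly renamed LLVM intrinsic. A minimal sketch of the new op spellings, written in the same LLVM-dialect syntax the integration tests below use; the function name and the choice of umin/fadd here are illustrative assumptions, not taken from the patch:

  llvm.func @reduce_sketch(%v: !llvm.vec<4 x i64>, %acc: !llvm.float, %x: !llvm.vec<4 x float>) -> !llvm.i64 {
    // Single-vector reductions take only the vector operand
    // (formerly spelled "llvm.intr.experimental.vector.reduce.umin").
    %m = "llvm.intr.vector.reduce.umin"(%v) : (!llvm.vec<4 x i64>) -> !llvm.i64
    // The former "v2" ops keep their accumulator operand and the optional reassoc flag
    // (formerly spelled "llvm.intr.experimental.vector.reduce.v2.fadd").
    %s = "llvm.intr.vector.reduce.fadd"(%acc, %x) {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
    llvm.return %m : !llvm.i64
  }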
diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td
--- a/mlir/include/mlir/Dialect/Vector/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td
@@ -207,7 +207,7 @@
     Note that these operations are restricted to 1-D vectors to remain
     close to the corresponding LLVM intrinsics:
-    http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics
+    http://llvm.org/docs/LangRef.html#vector-reduction-intrinsics
     Example:
diff --git a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
--- a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
+++ b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-fp.mlir
@@ -24,61 +24,61 @@
     %12 = llvm.mlir.constant(3 : i64) : !llvm.i64
     %v = llvm.insertelement %3, %11[%12 : !llvm.i64] : !llvm.vec<4 x float>
-    %max = "llvm.intr.experimental.vector.reduce.fmax"(%v)
+    %max = "llvm.intr.vector.reduce.fmax"(%v)
         : (!llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%max) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
-    %min = "llvm.intr.experimental.vector.reduce.fmin"(%v)
+    %min = "llvm.intr.vector.reduce.fmin"(%v)
         : (!llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%min) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
-    %add1 = "llvm.intr.experimental.vector.reduce.v2.fadd"(%0, %v)
+    %add1 = "llvm.intr.vector.reduce.fadd"(%0, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add1) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 11
-    %add1r = "llvm.intr.experimental.vector.reduce.v2.fadd"(%0, %v)
+    %add1r = "llvm.intr.vector.reduce.fadd"(%0, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add1r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 11
-    %add2 = "llvm.intr.experimental.vector.reduce.v2.fadd"(%1, %v)
+    %add2 = "llvm.intr.vector.reduce.fadd"(%1, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add2) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 12
-    %add2r = "llvm.intr.experimental.vector.reduce.v2.fadd"(%1, %v)
+    %add2r = "llvm.intr.vector.reduce.fadd"(%1, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%add2r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 12
-    %mul1 = "llvm.intr.experimental.vector.reduce.v2.fmul"(%0, %v)
+    %mul1 = "llvm.intr.vector.reduce.fmul"(%0, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul1) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
-    %mul1r = "llvm.intr.experimental.vector.reduce.v2.fmul"(%0, %v)
+    %mul1r = "llvm.intr.vector.reduce.fmul"(%0, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul1r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
-    %mul2 = "llvm.intr.experimental.vector.reduce.v2.fmul"(%1, %v)
+    %mul2 = "llvm.intr.vector.reduce.fmul"(%1, %v)
        : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul2) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 48
-    %mul2r = "llvm.intr.experimental.vector.reduce.v2.fmul"(%1, %v)
+    %mul2r = "llvm.intr.vector.reduce.fmul"(%1, %v)
        {reassoc = true} : (!llvm.float, !llvm.vec<4 x float>) -> !llvm.float
     llvm.call @printF32(%mul2r) : (!llvm.float) -> ()
     llvm.call @printNewline() : () -> ()
diff --git a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
--- a/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
+++ b/mlir/integration_test/Dialect/LLVMIR/CPU/test-vector-reductions-int.mlir
@@ -24,55 +24,55 @@
     %12 = llvm.mlir.constant(3 : i64) : !llvm.i64
     %v = llvm.insertelement %3, %11[%12 : !llvm.i64] : !llvm.vec<4 x i64>
-    %add = "llvm.intr.experimental.vector.reduce.add"(%v)
+    %add = "llvm.intr.vector.reduce.add"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%add) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 10
-    %and = "llvm.intr.experimental.vector.reduce.and"(%v)
+    %and = "llvm.intr.vector.reduce.and"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%and) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 0
-    %mul = "llvm.intr.experimental.vector.reduce.mul"(%v)
+    %mul = "llvm.intr.vector.reduce.mul"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%mul) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 24
-    %or = "llvm.intr.experimental.vector.reduce.or"(%v)
+    %or = "llvm.intr.vector.reduce.or"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%or) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 7
-    %smax = "llvm.intr.experimental.vector.reduce.smax"(%v)
+    %smax = "llvm.intr.vector.reduce.smax"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%smax) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
-    %smin = "llvm.intr.experimental.vector.reduce.smin"(%v)
+    %smin = "llvm.intr.vector.reduce.smin"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%smin) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
-    %umax = "llvm.intr.experimental.vector.reduce.umax"(%v)
+    %umax = "llvm.intr.vector.reduce.umax"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%umax) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 4
-    %umin = "llvm.intr.experimental.vector.reduce.umin"(%v)
+    %umin = "llvm.intr.vector.reduce.umin"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%umin) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
     // CHECK: 1
-    %xor = "llvm.intr.experimental.vector.reduce.xor"(%v)
+    %xor = "llvm.intr.vector.reduce.xor"(%v)
        : (!llvm.vec<4 x i64>) -> !llvm.i64
     llvm.call @printI64(%xor) : (!llvm.i64) -> ()
     llvm.call @printNewline() : () -> ()
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -564,33 +564,33 @@
     if (eltType.isIntOrIndex()) {
       // Integer reductions: add/mul/min/max/and/or/xor.
       if (kind == "add")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_add>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_add>(
             op, llvmType, operands[0]);
       else if (kind == "mul")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_mul>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_mul>(
             op, llvmType, operands[0]);
       else if (kind == "min" &&
                (eltType.isIndex() || eltType.isUnsignedInteger()))
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_umin>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_umin>(
             op, llvmType, operands[0]);
       else if (kind == "min")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_smin>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_smin>(
             op, llvmType, operands[0]);
       else if (kind == "max" &&
                (eltType.isIndex() || eltType.isUnsignedInteger()))
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_umax>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_umax>(
             op, llvmType, operands[0]);
       else if (kind == "max")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_smax>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_smax>(
             op, llvmType, operands[0]);
       else if (kind == "and")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_and>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_and>(
             op, llvmType, operands[0]);
       else if (kind == "or")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_or>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_or>(
             op, llvmType, operands[0]);
       else if (kind == "xor")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_xor>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_xor>(
             op, llvmType, operands[0]);
       else
         return failure();
@@ -604,7 +604,7 @@
                         : rewriter.create<LLVM::ConstantOp>(
                               op->getLoc(), llvmType,
                               rewriter.getZeroAttr(eltType));
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_v2_fadd>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fadd>(
             op, llvmType, acc, operands[0],
             rewriter.getBoolAttr(reassociateFPReductions));
       } else if (kind == "mul") {
@@ -614,14 +614,14 @@
                         : rewriter.create<LLVM::ConstantOp>(
                               op->getLoc(), llvmType,
                               rewriter.getFloatAttr(eltType, 1.0));
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_v2_fmul>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fmul>(
             op, llvmType, acc, operands[0],
             rewriter.getBoolAttr(reassociateFPReductions));
       } else if (kind == "min")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_fmin>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fmin>(
             op, llvmType, operands[0]);
       else if (kind == "max")
-        rewriter.replaceOpWithNewOp<LLVM::experimental_vector_reduce_fmax>(
+        rewriter.replaceOpWithNewOp<LLVM::vector_reduce_fmax>(
             op, llvmType, operands[0]);
       else
         return failure();
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
--- a/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-reduction-to-llvm.mlir
@@ -5,14 +5,14 @@
 // CHECK-LABEL: llvm.func @reduce_add_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // CHECK: llvm.return %[[V]] : !llvm.float
 //
 // REASSOC-LABEL: llvm.func @reduce_add_f32(
 // REASSOC-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // REASSOC: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-// REASSOC: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// REASSOC: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // REASSOC-SAME: {reassoc = true} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // REASSOC: llvm.return %[[V]] : !llvm.float
 //
@@ -25,14 +25,14 @@
 // CHECK-LABEL: llvm.func @reduce_mul_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fmul"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fmul"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // CHECK: llvm.return %[[V]] : !llvm.float
 //
 // REASSOC-LABEL: llvm.func @reduce_mul_f32(
 // REASSOC-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // REASSOC: %[[C:.*]] = llvm.mlir.constant(1.000000e+00 : f32) : !llvm.float
-// REASSOC: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fmul"(%[[C]], %[[A]])
+// REASSOC: %[[V:.*]] = "llvm.intr.vector.reduce.fmul"(%[[C]], %[[A]])
 // REASSOC-SAME: {reassoc = true} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // REASSOC: llvm.return %[[V]] : !llvm.float
 //
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -774,7 +774,7 @@
 // CHECK-LABEL: llvm.func @reduce_f16(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x half>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f16) : !llvm.half
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.half, !llvm.vec<16 x half>) -> !llvm.half
 // CHECK: llvm.return %[[V]] : !llvm.half
@@ -785,7 +785,7 @@
 // CHECK-LABEL: llvm.func @reduce_f32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x float>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.float, !llvm.vec<16 x float>) -> !llvm.float
 // CHECK: llvm.return %[[V]] : !llvm.float
@@ -796,7 +796,7 @@
 // CHECK-LABEL: llvm.func @reduce_f64(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x double>)
 // CHECK: %[[C:.*]] = llvm.mlir.constant(0.000000e+00 : f64) : !llvm.double
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.v2.fadd"(%[[C]], %[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.fadd"(%[[C]], %[[A]])
 // CHECK-SAME: {reassoc = false} : (!llvm.double, !llvm.vec<16 x double>) -> !llvm.double
 // CHECK: llvm.return %[[V]] : !llvm.double
@@ -806,7 +806,7 @@
 }
 // CHECK-LABEL: llvm.func @reduce_i8(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i8>)
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 // CHECK: llvm.return %[[V]] : !llvm.i8
 func @reduce_i32(%arg0: vector<16xi32>) -> i32 {
@@ -815,7 +815,7 @@
 }
 // CHECK-LABEL: llvm.func @reduce_i32(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i32>)
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 // CHECK: llvm.return %[[V]] : !llvm.i32
 func @reduce_i64(%arg0: vector<16xi64>) -> i64 {
@@ -824,7 +824,7 @@
 }
 // CHECK-LABEL: llvm.func @reduce_i64(
 // CHECK-SAME: %[[A:.*]]: !llvm.vec<16 x i64>)
-// CHECK: %[[V:.*]] = "llvm.intr.experimental.vector.reduce.add"(%[[A]])
+// CHECK: %[[V:.*]] = "llvm.intr.vector.reduce.add"(%[[A]])
 // CHECK: llvm.return %[[V]] : !llvm.i64
diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir
--- a/mlir/test/Target/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/llvmir-intrinsics.mlir
@@ -182,36 +182,36 @@
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: !llvm.float, %arg1: !llvm.vec<8 x float>, %arg2: !llvm.vec<8 x i32>) {
-  // CHECK: call i32 @llvm.experimental.vector.reduce.add.v8i32
-  "llvm.intr.experimental.vector.reduce.add"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.and.v8i32
-  "llvm.intr.experimental.vector.reduce.and"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call float @llvm.experimental.vector.reduce.fmax.v8f32
-  "llvm.intr.experimental.vector.reduce.fmax"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call float @llvm.experimental.vector.reduce.fmin.v8f32
-  "llvm.intr.experimental.vector.reduce.fmin"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call i32 @llvm.experimental.vector.reduce.mul.v8i32
-  "llvm.intr.experimental.vector.reduce.mul"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.or.v8i32
-  "llvm.intr.experimental.vector.reduce.or"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.smax.v8i32
-  "llvm.intr.experimental.vector.reduce.smax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.smin.v8i32
-  "llvm.intr.experimental.vector.reduce.smin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.umax.v8i32
-  "llvm.intr.experimental.vector.reduce.umax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call i32 @llvm.experimental.vector.reduce.umin.v8i32
-  "llvm.intr.experimental.vector.reduce.umin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
-  // CHECK: call float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fadd"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fmul"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call reassoc float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fadd"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call reassoc float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32
-  "llvm.intr.experimental.vector.reduce.v2.fmul"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
-  // CHECK: call i32 @llvm.experimental.vector.reduce.xor.v8i32
-  "llvm.intr.experimental.vector.reduce.xor"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.add.v8i32
+  "llvm.intr.vector.reduce.add"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.and.v8i32
+  "llvm.intr.vector.reduce.and"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call float @llvm.vector.reduce.fmax.v8f32
+  "llvm.intr.vector.reduce.fmax"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call float @llvm.vector.reduce.fmin.v8f32
+  "llvm.intr.vector.reduce.fmin"(%arg1) : (!llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call i32 @llvm.vector.reduce.mul.v8i32
+  "llvm.intr.vector.reduce.mul"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.or.v8i32
+  "llvm.intr.vector.reduce.or"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.smax.v8i32
+  "llvm.intr.vector.reduce.smax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.smin.v8i32
+  "llvm.intr.vector.reduce.smin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.umax.v8i32
+  "llvm.intr.vector.reduce.umax"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call i32 @llvm.vector.reduce.umin.v8i32
+  "llvm.intr.vector.reduce.umin"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
+  // CHECK: call float @llvm.vector.reduce.fadd.f32.v8f32
+  "llvm.intr.vector.reduce.fadd"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call float @llvm.vector.reduce.fmul.f32.v8f32
+  "llvm.intr.vector.reduce.fmul"(%arg0, %arg1) : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call reassoc float @llvm.vector.reduce.fadd.f32.v8f32
+  "llvm.intr.vector.reduce.fadd"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call reassoc float @llvm.vector.reduce.fmul.f32.v8f32
+  "llvm.intr.vector.reduce.fmul"(%arg0, %arg1) {reassoc = true} : (!llvm.float, !llvm.vec<8 x float>) -> !llvm.float
+  // CHECK: call i32 @llvm.vector.reduce.xor.v8i32
+  "llvm.intr.vector.reduce.xor"(%arg2) : (!llvm.vec<8 x i32>) -> !llvm.i32
   llvm.return
 }