Index: llvm/trunk/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/trunk/include/llvm/IR/Intrinsics.td +++ llvm/trunk/include/llvm/IR/Intrinsics.td @@ -156,10 +156,15 @@ // the intrinsic is overloaded, so the matched type should be declared as iAny. class LLVMExtendedType<int num> : LLVMMatchType<num>; class LLVMTruncatedType<int num> : LLVMMatchType<num>; -class LLVMVectorSameWidth<int idx, LLVMType elty> - : LLVMMatchType<idx> { + +// Match the scalar/vector of another intrinsic parameter but with a different +// element type. Either both are scalars or both are vectors with the same +// number of elements. +class LLVMScalarOrSameVectorWidth<int idx, LLVMType elty> + : LLVMMatchType<idx> { ValueType ElTy = elty.VT; } + class LLVMPointerTo<int num> : LLVMMatchType<num>; class LLVMPointerToElt<int num> : LLVMMatchType<num>; class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>; @@ -796,24 +801,30 @@ // // Expose the carry flag from add operations on two integrals. -def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], +def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], +def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], +def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], +def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], +def int_smul_with_overflow : 
Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty], +def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1001,35 +1012,35 @@ def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty, - LLVMVectorSameWidth<0, llvm_i1_ty>], + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [IntrArgMemOnly]>; def int_masked_load : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty, - LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>], + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrReadMem, IntrArgMemOnly]>; def int_masked_gather: Intrinsic<[llvm_anyvector_ty], [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, - LLVMVectorSameWidth<0, llvm_i1_ty>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrReadMem]>; def int_masked_scatter: Intrinsic<[], [llvm_anyvector_ty, LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty, - LLVMVectorSameWidth<0, llvm_i1_ty>]>; + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>]>; def int_masked_expandload: Intrinsic<[llvm_anyvector_ty], [LLVMPointerToElt<0>, - LLVMVectorSameWidth<0, llvm_i1_ty>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrReadMem]>; def int_masked_compressstore: Intrinsic<[], [llvm_anyvector_ty, LLVMPointerToElt<0>, - LLVMVectorSameWidth<0, llvm_i1_ty>], + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [IntrArgMemOnly]>; // Test whether a pointer is associated with a type metadata identifier. 
Index: llvm/trunk/lib/IR/Function.cpp =================================================================== --- llvm/trunk/lib/IR/Function.cpp +++ llvm/trunk/lib/IR/Function.cpp @@ -948,10 +948,9 @@ case IITDescriptor::SameVecWidthArgument: { Type *EltTy = DecodeFixedType(Infos, Tys, Context); Type *Ty = Tys[D.getArgumentNumber()]; - if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { + if (auto *VTy = dyn_cast<VectorType>(Ty)) return VectorType::get(EltTy, VTy->getNumElements()); - } - llvm_unreachable("unhandled"); + return EltTy; } case IITDescriptor::PtrToArgument: { Type *Ty = Tys[D.getArgumentNumber()]; @@ -1135,15 +1134,19 @@ case IITDescriptor::SameVecWidthArgument: { if (D.getArgumentNumber() >= ArgTys.size()) return true; - VectorType * ReferenceType = - dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]); - VectorType *ThisArgType = dyn_cast<VectorType>(Ty); - if (!ThisArgType || !ReferenceType || - (ReferenceType->getVectorNumElements() != - ThisArgType->getVectorNumElements())) + auto *ReferenceType = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]); + auto *ThisArgType = dyn_cast<VectorType>(Ty); + // Both must be vectors of the same number of elements or neither. 
+ if ((ReferenceType != nullptr) != (ThisArgType != nullptr)) return true; - return matchIntrinsicType(ThisArgType->getVectorElementType(), - Infos, ArgTys); + Type *EltTy = Ty; + if (ThisArgType) { + if (ReferenceType->getVectorNumElements() != + ThisArgType->getVectorNumElements()) + return true; + EltTy = ThisArgType->getVectorElementType(); + } + return matchIntrinsicType(EltTy, Infos, ArgTys); } case IITDescriptor::PtrToArgument: { if (D.getArgumentNumber() >= ArgTys.size()) Index: llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll +++ llvm/trunk/test/Analysis/CostModel/X86/arith-overflow.ll @@ -0,0 +1,414 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ +; +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -cost-model -analyze 
-mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,GLM +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,BTVER2 + +; +; sadd.with.overflow +; + +declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) +declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) +declare {<4 x i64>, <4 x i1>} @llvm.sadd.with.overflow.v4i64(<4 x i64>, <4 x i64>) +declare {<8 x i64>, <8 x i1>} @llvm.sadd.with.overflow.v8i64(<8 x i64>, <8 x i64>) + +declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) +declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) +declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<16 x i16>, <16 x i1>} @llvm.sadd.with.overflow.v16i16(<16 x i16>, <16 x i16>) +declare {<32 x i16>, <32 x i1>} @llvm.sadd.with.overflow.v32i16(<32 x i16>, <32 x i16>) + +declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) +declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<32 x i8>, <32 x i1>} @llvm.sadd.with.overflow.v32i8(<32 x i8>, <32 x i8>) +declare {<64 x i8>, <64 x i1>} @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x i8>) + +define i32 @sadd(i32 %arg) { +; CHECK-LABEL: 'sadd' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x 
i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 
95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) + %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) + %V4I32 = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) + %V8I16 = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) + %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +; +; uadd.with.overflow 
+; + +declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) +declare {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>) +declare {<4 x i64>, <4 x i1>} @llvm.uadd.with.overflow.v4i64(<4 x i64>, <4 x i64>) +declare {<8 x i64>, <8 x i1>} @llvm.uadd.with.overflow.v8i64(<8 x i64>, <8 x i64>) + +declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) +declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) +declare {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<16 x i16>, <16 x i1>} @llvm.uadd.with.overflow.v16i16(<16 x i16>, <16 x i16>) +declare {<32 x i16>, <32 x i1>} @llvm.uadd.with.overflow.v32i16(<32 x i16>, <32 x i16>) + +declare {i8, i1} @llvm.uadd.with.overflow.i8(i8, i8) +declare {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<32 x i8>, <32 x i1>} @llvm.uadd.with.overflow.v32i8(<32 x i8>, <32 x i8>) +declare {<64 x i8>, <64 x i1>} @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x i8>) + +define i32 @uadd(i32 %arg) { +; CHECK-LABEL: 'uadd' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) 
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } 
@llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) + %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) + %V4I32 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) + %V8I16 = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) + %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +; +; ssub.with.overflow +; + +declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) +declare {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>) +declare {<4 x i64>, <4 x i1>} @llvm.ssub.with.overflow.v4i64(<4 x i64>, <4 x i64>) 
+declare {<8 x i64>, <8 x i1>} @llvm.ssub.with.overflow.v8i64(<8 x i64>, <8 x i64>) + +declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) +declare {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) +declare {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<16 x i16>, <16 x i1>} @llvm.ssub.with.overflow.v16i16(<16 x i16>, <16 x i16>) +declare {<32 x i16>, <32 x i1>} @llvm.ssub.with.overflow.v32i16(<32 x i16>, <32 x i16>) + +declare {i8, i1} @llvm.ssub.with.overflow.i8(i8, i8) +declare {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<32 x i8>, <32 x i1>} @llvm.ssub.with.overflow.v32i8(<32 x i8>, <32 x i8>) +declare {<64 x i8>, <64 x i1>} @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x i8>) + +define i32 @ssub(i32 %arg) { +; CHECK-LABEL: 'ssub' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = 
call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) + %V2I64 = 
call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) + %V4I32 = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call {<8 x i32>, <8 x i1>} @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) + %V8I16 = call {<8 x i16>, <8 x i1>} @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) + %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +; +; usub.with.overflow +; + +declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) +declare {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>) +declare {<4 x i64>, <4 x i1>} @llvm.usub.with.overflow.v4i64(<4 x i64>, <4 x i64>) +declare {<8 x i64>, <8 x i1>} @llvm.usub.with.overflow.v8i64(<8 x i64>, <8 x i64>) + +declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) +declare {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<8 x i32>, 
<8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) +declare {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<16 x i16>, <16 x i1>} @llvm.usub.with.overflow.v16i16(<16 x i16>, <16 x i16>) +declare {<32 x i16>, <32 x i1>} @llvm.usub.with.overflow.v32i16(<32 x i16>, <32 x i16>) + +declare {i8, i1} @llvm.usub.with.overflow.i8(i8, i8) +declare {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<32 x i8>, <32 x i1>} @llvm.usub.with.overflow.v32i8(<32 x i8>, <32 x i8>) +declare {<64 x i8>, <64 x i1>} @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x i8>) + +define i32 @usub(i32 %arg) { +; CHECK-LABEL: 'usub' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> 
undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef) + %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call {<8 x i64>, <8 x i1>} 
@llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 undef, i32 undef) + %V4I32 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call {<8 x i32>, <8 x i1>} @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 undef, i16 undef) + %V8I16 = call {<8 x i16>, <8 x i1>} @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 undef, i8 undef) + %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +; +; smul.with.overflow +; + +declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) +declare {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64>, <2 x i64>) +declare {<4 x i64>, <4 x i1>} @llvm.smul.with.overflow.v4i64(<4 x i64>, <4 x i64>) +declare {<8 x i64>, <8 x i1>} @llvm.smul.with.overflow.v8i64(<8 x i64>, <8 x i64>) + +declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) +declare {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.smul.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.smul.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) +declare {<8 x i16>, 
<8 x i1>} @llvm.smul.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<16 x i16>, <16 x i1>} @llvm.smul.with.overflow.v16i16(<16 x i16>, <16 x i16>) +declare {<32 x i16>, <32 x i1>} @llvm.smul.with.overflow.v32i16(<32 x i16>, <32 x i16>) + +declare {i8, i1} @llvm.smul.with.overflow.i8(i8, i8) +declare {<16 x i8>, <16 x i1>} @llvm.smul.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<32 x i8>, <32 x i1>} @llvm.smul.with.overflow.v32i8(<32 x i8>, <32 x i8>) +declare {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x i8>) + +define i32 @smul(i32 %arg) { +; CHECK-LABEL: 'smul' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef) + %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 undef, i32 undef) + %V4I32 = call {<4 x i32>, <4 x i1>} @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) + 
%V8I32 = call {<8 x i32>, <8 x i1>} @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 undef, i16 undef) + %V8I16 = call {<8 x i16>, <8 x i1>} @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 undef, i8 undef) + %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} + +; +; umul.with.overflow +; + +declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) +declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>) +declare {<4 x i64>, <4 x i1>} @llvm.umul.with.overflow.v4i64(<4 x i64>, <4 x i64>) +declare {<8 x i64>, <8 x i1>} @llvm.umul.with.overflow.v8i64(<8 x i64>, <8 x i64>) + +declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) +declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>) +declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>) +declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>) + +declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) +declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>) +declare {<16 x i16>, <16 x i1>} @llvm.umul.with.overflow.v16i16(<16 x i16>, <16 x i16>) +declare {<32 x i16>, <32 x i1>} @llvm.umul.with.overflow.v32i16(<32 x i16>, <32 x i16>) + 
+declare {i8, i1} @llvm.umul.with.overflow.i8(i8, i8) +declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>) +declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>) +declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>) + +define i32 @umul(i32 %arg) { +; CHECK-LABEL: 'umul' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } 
@llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; + %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef) + %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) + %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) + %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) + + %I32 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 undef, i32 undef) + %V4I32 = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) + %V8I32 = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) + %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) + + %I16 = call {i16, 
i1} @llvm.umul.with.overflow.i16(i16 undef, i16 undef) + %V8I16 = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) + %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) + %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) + + %I8 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 undef, i8 undef) + %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) + %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) + %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) + + ret i32 undef +} Index: llvm/trunk/utils/TableGen/CodeGenTarget.cpp =================================================================== --- llvm/trunk/utils/TableGen/CodeGenTarget.cpp +++ llvm/trunk/utils/TableGen/CodeGenTarget.cpp @@ -633,7 +633,7 @@ // overloaded, all the types can be specified directly. assert(((!TyEl->isSubClassOf("LLVMExtendedType") && !TyEl->isSubClassOf("LLVMTruncatedType") && - !TyEl->isSubClassOf("LLVMVectorSameWidth")) || + !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) || VT == MVT::iAny || VT == MVT::vAny) && "Expected iAny or vAny type"); } else Index: llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp =================================================================== --- llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp +++ llvm/trunk/utils/TableGen/IntrinsicEmitter.cpp @@ -269,7 +269,7 @@ Sig.push_back(IIT_TRUNC_ARG); else if (R->isSubClassOf("LLVMHalfElementsVectorType")) Sig.push_back(IIT_HALF_VEC_ARG); - else if (R->isSubClassOf("LLVMVectorSameWidth")) { + else if (R->isSubClassOf("LLVMScalarOrSameVectorWidth")) { Sig.push_back(IIT_SAME_VEC_WIDTH_ARG); Sig.push_back((Number << 3) | ArgCodes[Number]); MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy"));