Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -189,6 +189,9 @@
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
+  bool areFunctionArgsABICompatible(const Function *Caller,
+                                    const Function *Callee,
+                                    SmallPtrSetImpl<Argument *> &Args) const;
   const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
       bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3070,6 +3070,22 @@
   return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
 }
 
+bool X86TTIImpl::areFunctionArgsABICompatible(
+    const Function *Caller, const Function *Callee,
+    SmallPtrSetImpl<Argument *> &Args) const {
+  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
+    return false;
+
+  // If we get here, we know the target features match. If one function
+  // considers 512-bit vectors legal and the other does not, consider them
+  // incompatible.
+  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
+  const TargetMachine &TM = getTLI()->getTargetMachine();
+
+  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+         TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+}
+
 const X86TTIImpl::TTI::MemCmpExpansionOptions *
 X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
   // Only enable vector loads for equality comparison.
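
Context note (not part of the patch): the new override strengthens the generic
TTI hook that ArgumentPromotion-style transforms are expected to query before
rewriting a callee's signature. Below is a minimal sketch of such a query site;
the helper name canPromoteArgs and the surrounding plumbing are illustrative
assumptions, and only the TargetTransformInfo::areFunctionArgsABICompatible
call itself is existing API.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Hypothetical helper: a transform asks the target whether promoting the given
// pointer arguments to by-value arguments keeps caller and callee
// ABI-compatible. With this patch, the X86 implementation additionally rejects
// pairs that disagree on whether 512-bit vector registers are used.
static bool canPromoteArgs(const Function &Caller, const Function &Callee,
                           SmallPtrSetImpl<Argument *> &ArgsToPromote,
                           const TargetTransformInfo &TTI) {
  return TTI.areFunctionArgsABICompatible(&Caller, &Callee, ArgsToPromote);
}
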
Index: test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
===================================================================
--- /dev/null
+++ test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -0,0 +1,184 @@
+; RUN: opt -S -argpromotion < %s | FileCheck %s
+; RUN: opt -S -passes=argpromotion < %s | FileCheck %s
+; Test that we only promote arguments when the caller/callee have compatible
+; function attributes.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; This should promote
+; CHECK-LABEL: @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* %arg, <8 x i64> %arg1.val)
+define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #0 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* %arg) #0 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should promote
+; CHECK-LABEL: @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64> %arg1.val)
+define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #1 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg) #1 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should promote
+; CHECK-LABEL: @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64> %arg1.val)
+define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #1 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* %arg) #0 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should promote
+; CHECK-LABEL: @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* %arg, <8 x i64> %arg1.val)
+define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #0 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx512_legal256_prefer256_call_avx512_legal512_prefer512(<8 x i64>* %arg) #1 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should not promote
+; CHECK-LABEL: @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1)
+define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #1 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg) #2 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should not promote
+; CHECK-LABEL: @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1)
+define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #2 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* %arg) #1 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should promote
+; CHECK-LABEL: @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %arg, <8 x i64> %arg1.val)
+define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #3 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %arg) #4 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; This should promote
+; CHECK-LABEL: @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %arg, <8 x i64> %arg1.val)
+define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #4 {
+bb:
+  %tmp = load <8 x i64>, <8 x i64>* %arg1
+  store <8 x i64> %tmp, <8 x i64>* %arg
+  ret void
+}
+
+define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %arg) #3 {
+bb:
+  %tmp = alloca <8 x i64>, align 32
+  %tmp2 = alloca <8 x i64>, align 32
+  %tmp3 = bitcast <8 x i64>* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 32 %tmp3, i8 0, i64 32, i1 false)
+  call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %tmp2, <8 x i64>* %tmp)
+  %tmp4 = load <8 x i64>, <8 x i64>* %tmp2, align 32
+  store <8 x i64> %tmp4, <8 x i64>* %arg, align 2
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5
+
+attributes #0 = { inlinehint norecurse nounwind uwtable "target-features"="+avx512vl" "min-legal-vector-width"="512" "prefer-vector-width"="512" }
+attributes #1 = { inlinehint norecurse nounwind uwtable "target-features"="+avx512vl" "min-legal-vector-width"="512" "prefer-vector-width"="256" }
+attributes #2 = { inlinehint norecurse nounwind uwtable "target-features"="+avx512vl" "min-legal-vector-width"="256" "prefer-vector-width"="256" }
+attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="512" "prefer-vector-width"="256" }
+attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" }
+attributes #5 = { argmemonly nounwind }
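
Reviewer note (not part of the patch): the attribute sets above exercise the
"min-legal-vector-width"/"prefer-vector-width" pairings the new hook cares
about. The sketch below is an illustrative model of the rule the test expects,
not the in-tree X86Subtarget::useAVX512Regs() logic; the struct and helper
names are assumptions made only for this note.

#include <cstdint>

// Attribute-derived vector configuration of one function; e.g. attributes #2
// above corresponds to {HasAVX512=true, MinLegalWidth=256, PreferWidth=256}.
struct VecConfig {
  bool HasAVX512;         // "target-features" contains an AVX-512 feature
  uint64_t MinLegalWidth; // "min-legal-vector-width"
  uint64_t PreferWidth;   // "prefer-vector-width"
};

// Model: 512-bit registers are used only when AVX-512 is available and either
// attribute asks for 512-bit vectors.
static bool uses512BitRegs(const VecConfig &C) {
  return C.HasAVX512 && (C.MinLegalWidth >= 512 || C.PreferWidth >= 512);
}

// Mirrors the new X86 override: promotion is allowed only when caller and
// callee agree on 512-bit register usage, so the #0/#1 pairs promote, the
// #1/#2 pairs do not, and the AVX2-only #3/#4 pairs promote because neither
// side uses 512-bit registers.
static bool argsABICompatible(const VecConfig &Caller, const VecConfig &Callee) {
  return uses512BitRegs(Caller) == uses512BitRegs(Callee);
}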