Index: llvm/lib/Passes/PassBuilderPipelines.cpp
===================================================================
--- llvm/lib/Passes/PassBuilderPipelines.cpp
+++ llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1249,6 +1249,9 @@
   // flattening of blocks.
   OptimizePM.addPass(DivRemPairsPass());
 
+  // Try to annotate calls that were created during optimization.
+  OptimizePM.addPass(TailCallElimPass());
+
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
   OptimizePM.addPass(
Index: llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -276,7 +276,7 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01)
-; GCN-PRELINK: %__pow2sqrt = call fast float @_Z4sqrtf(float %tmp)
+; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
 define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -288,7 +288,7 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float -5.000000e-01)
-; GCN-PRELINK: %__pow2rsqrt = call fast float @_Z5rsqrtf(float %tmp)
+; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
 define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -349,10 +349,10 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 1.013000e+03)
-; GCN-PRELINK: %__fabs = call fast float @_Z4fabsf(float %tmp)
-; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %__fabs)
+; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
-; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
@@ -369,13 +369,13 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
 ; GCN-POSTLINK: call fast float @_Z4powrff(float %tmp, float %tmp1)
-; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %tmp)
+; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
-; GCN-NATIVE: %__log2 = call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE: %__exp2 = call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
 ; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
 define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
 entry:
@@ -390,11 +390,11 @@
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
 ; GCN-POSTLINK: call fast float @_Z4pownfi(float %tmp, i32 %conv)
 ; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
-; GCN-PRELINK: %__fabs = call fast float @_Z4fabsf(float %tmp)
-; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %__fabs)
+; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
-; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %__yeven = shl i32 %conv, 31
 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
@@ -429,7 +429,7 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2)
-; GCN-PRELINK: %__rootn2sqrt = call fast float @_Z4sqrtf(float %tmp)
+; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
@@ -440,7 +440,7 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 3)
-; GCN-PRELINK: %__rootn2cbrt = call fast float @_Z4cbrtf(float %tmp)
+; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
@@ -461,7 +461,7 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
-; GCN-PRELINK: %__rootn2rsqrt = call fast float @_Z5rsqrtf(float %tmp)
+; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
@@ -620,9 +620,9 @@
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
 ; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-; GCN-NATIVE: %__log2 = call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE: %__exp2 = call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
 ; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
 define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
 entry:
Index: llvm/test/Other/new-pm-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-defaults.ll
+++ llvm/test/Other/new-pm-defaults.ll
@@ -262,6 +262,7 @@
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-EP-OPTIMIZER-LAST: Running pass: NoOpModulePass
 ; CHECK-HOT-COLD-SPLIT-NEXT: Running pass: HotColdSplittingPass
Index: llvm/test/Other/new-pm-thinlto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -226,6 +226,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass
Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -195,6 +195,7 @@
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O-NEXT: Running pass: ConstantMergePass
Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -207,6 +207,7 @@
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O-NEXT: Running pass: ConstantMergePass
Index: llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
===================================================================
--- llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
+++ llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
@@ -53,8 +53,8 @@
 ; CHECK-NEXT: store i32* [[ARRAY:%.*]], i32** [[TMP0]], align 8
 ; CHECK-NEXT: [[LOAD_I:%.*]] = load i32, i32* [[ARRAY]], align 4
 ; CHECK-NEXT: [[LOAD_POS_I:%.*]] = icmp sgt i32 [[LOAD_I]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[LOAD_I]], i32 0)
-; CHECK-NEXT: call void @print(i32 [[TMP1]])
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[LOAD_I]], i32 0)
+; CHECK-NEXT: tail call void @print(i32 [[TMP1]])
 ; CHECK-NEXT: [[CONT_CAST:%.*]] = select i1 [[LOAD_POS_I]], void (i8*, i1)* @f.resume.0, void (i8*, i1)* @f.resume.1
 ; CHECK-NEXT: call void [[CONT_CAST]](i8* nonnull [[DOTSUB]], i1 zeroext false)
 ; CHECK-NEXT: ret void
Index: llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
+++ llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
@@ -15,14 +15,14 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <2 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x double> @__sind2(<2 x double> [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = fadd fast <2 x double> [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
+; CHECK-NEXT: [[TMP7:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
 ; CHECK-NEXT: ret double [[TMP7]]
 ;
 entry:
Index: llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -291,7 +291,7 @@
 define double @external_use_with_fast_math(double* %a, i64 %n) {
 ; AUTO_VEC-LABEL: @external_use_with_fast_math(
 ; AUTO_VEC-NEXT: entry:
-; AUTO_VEC-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; AUTO_VEC-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 ; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
 ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; AUTO_VEC: vector.ph:
@@ -451,7 +451,7 @@
 define double @external_use_without_fast_math(double* %a, i64 %n) {
 ; AUTO_VEC-LABEL: @external_use_without_fast_math(
 ; AUTO_VEC-NEXT: entry:
-; AUTO_VEC-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; AUTO_VEC-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 ; AUTO_VEC-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
 ; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[SMAX]], 7
 ; AUTO_VEC-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
Index: llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
+++ llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
@@ -31,9 +31,9 @@
 ; CHECK-NEXT: br i1 [[C_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK: loop.preheader:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
-; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[SUB_I7_PEEL]], i64 [[TMP0]])
+; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I7_PEEL]], i64 [[TMP0]])
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]]
-; CHECK-NEXT: [[UMIN16:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]])
+; CHECK-NEXT: [[UMIN16:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]])
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN16]], 1
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER21:%.*]], label [[VECTOR_PH:%.*]]
@@ -71,7 +71,7 @@
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP17]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; CHECK-NEXT: br label [[LOOP_PREHEADER21]]
 ; CHECK: loop.preheader21:
 ; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
@@ -157,10 +157,10 @@
 ; CHECK-NEXT: br i1 [[COND_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
 ; CHECK: loop.preheader:
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
-; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[SUB_I19_PEEL]], i64 [[TMP0]])
+; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I19_PEEL]], i64 [[TMP0]])
 ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]]
-; CHECK-NEXT: [[UMIN28:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I7_PEEL]])
-; CHECK-NEXT: [[UMIN29:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN28]], i64 [[SUB_I]])
+; CHECK-NEXT: [[UMIN28:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I7_PEEL]])
+; CHECK-NEXT: [[UMIN29:%.*]] = tail call i64 @llvm.umin.i64(i64 [[UMIN28]], i64 [[SUB_I]])
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN29]], 1
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER36:%.*]], label [[VECTOR_PH:%.*]]
@@ -206,7 +206,7 @@
 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP23]], [[TMP22]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP25:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
 ; CHECK-NEXT: br label [[LOOP_PREHEADER36]]
 ; CHECK: loop.preheader36:
 ; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
Index: llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll
@@ -25,7 +25,7 @@
 ; CHECK-NEXT: [[TOBOOL_NOT1:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[TOBOOL_NOT1]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.abs.i32(i32 [[N]], i1 true)
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[N_ADDR_03:%.*]] = phi i32 [ [[TMP1:%.*]], [[WHILE_BODY]] ], [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ]
Index: llvm/test/Transforms/PhaseOrdering/X86/loop-idiom-vs-indvars.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/loop-idiom-vs-indvars.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/loop-idiom-vs-indvars.ll
@@ -12,7 +12,7 @@
 ; ALL-LABEL: @cttz(
 ; ALL-NEXT: entry:
 ; ALL-NEXT: [[TMP0:%.*]] = shl i32 [[N:%.*]], 1
-; ALL-NEXT: [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0]], i1 false), [[RNG0:!range !.*]]
+; ALL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[TMP0]], i1 false), !range [[RNG0:![0-9]+]]
 ; ALL-NEXT: [[TMP2:%.*]] = sub nuw nsw i32 32, [[TMP1]]
 ; ALL-NEXT: [[TMP3:%.*]] = sub nuw nsw i32 75, [[TMP1]]
 ; ALL-NEXT: store i32 [[TMP3]], i32* [[P1:%.*]], align 4
Index: llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/vector-reduction-known-first-value.ll
@@ -29,7 +29,7 @@
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[BIN_RDX]])
 ; CHECK-NEXT: ret i16 [[TMP7]]
 ;
 entry:
Index: llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -8,7 +8,7 @@
 define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_or_reduction_v4i32(
 ; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[Z]])
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[Z]])
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
   %z = and <4 x i32> %x, %y
@@ -42,7 +42,7 @@
 define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_and_extra_add_v4i32(
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]], <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT: ret i32 [[TMP2]]
 ;
   %y0 = extractelement <4 x i32> %y, i32 0
@@ -66,8 +66,8 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 true)
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP5:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp sle i32 [[TMP6]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND6:%.*]] = zext i1 [[CMP5_NOT]] to i32
 ; CHECK-NEXT: ret i32 [[COND6]]
@@ -126,7 +126,7 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3_NOT]] to i32
 ; CHECK-NEXT: ret i32 [[COND]]
@@ -173,8 +173,8 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP5:%.*]] = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ole float [[TMP6]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND5:%.*]] = zext i1 [[CMP4]] to i32
 ; CHECK-NEXT: ret i32 [[COND5]]
@@ -233,7 +233,7 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[VEC1:%.*]] to <4 x float>*
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
+; CHECK-NEXT: [[TMP5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast ole float [[TMP5]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP3]] to i32
 ; CHECK-NEXT: ret i32 [[COND]]
Index: llvm/test/Transforms/PhaseOrdering/memset-tail.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/memset-tail.ll
+++ llvm/test/Transforms/PhaseOrdering/memset-tail.ll
@@ -8,7 +8,7 @@
 ; CHECK-NEXT: br i1 [[CMP_NOT1]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
 ; CHECK: while.body.preheader:
 ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[C]] to i64
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[D:%.*]], i8 0, i64 [[TMP0]], i1 false)
+; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 1 [[D:%.*]], i8 0, i64 [[TMP0]], i1 false)
 ; CHECK-NEXT: br label [[WHILE_END]]
 ; CHECK: while.end:
 ; CHECK-NEXT: ret void
Index: llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll
+++ llvm/test/Transforms/PhaseOrdering/min-max-abs-cse.ll
@@ -14,7 +14,7 @@
 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i8 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[A]], [[B]]
 ; CHECK-NEXT: [[M1:%.*]] = select i1 [[CMP1]], i8 0, i8 [[SUB]]
-; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[SUB]], i8 0)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.smax.i8(i8 [[SUB]], i8 0)
 ; CHECK-NEXT: [[R:%.*]] = sub i8 [[TMP1]], [[M1]]
 ; CHECK-NEXT: ret i8 [[R]]
 ;
@@ -31,7 +31,7 @@
 
 define i8 @abs_swapped(i8 %a) {
 ; CHECK-LABEL: @abs_swapped(
-; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false)
 ; CHECK-NEXT: ret i8 [[TMP1]]
 ;
   %neg = sub i8 0, %a
@@ -77,7 +77,7 @@
 
 define i8 @nabs_different_constants(i8 %a) {
 ; CHECK-LABEL: @nabs_different_constants(
-; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false)
 ; CHECK-NEXT: [[M1:%.*]] = sub i8 0, [[TMP1]]
 ; CHECK-NEXT: ret i8 [[M1]]
 ;
Index: llvm/test/Transforms/PhaseOrdering/minmax.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/minmax.ll
+++ llvm/test/Transforms/PhaseOrdering/minmax.ll
@@ -10,13 +10,13 @@
 define void @cmyk(i8 %r, i8 %g, i8 %b) {
 ; CHECK-LABEL: @cmyk(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.smax.i8(i8 [[R:%.*]], i8 [[G:%.*]])
-; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[B:%.*]], i8 [[TMP0]])
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.smax.i8(i8 [[R:%.*]], i8 [[G:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.smax.i8(i8 [[B:%.*]], i8 [[TMP0]])
 ; CHECK-NEXT: [[K_0:%.*]] = xor i8 [[TMP1]], -1
 ; CHECK-NEXT: [[SUB31:%.*]] = sub i8 [[TMP1]], [[R]]
 ; CHECK-NEXT: [[SUB35:%.*]] = sub i8 [[TMP1]], [[G]]
 ; CHECK-NEXT: [[SUB39:%.*]] = sub i8 [[TMP1]], [[B]]
-; CHECK-NEXT: call void @use(i8 [[SUB31]], i8 [[SUB35]], i8 [[SUB39]], i8 [[K_0]])
+; CHECK-NEXT: tail call void @use(i8 [[SUB31]], i8 [[SUB35]], i8 [[SUB39]], i8 [[K_0]])
 ; CHECK-NEXT: ret void
 ;
 entry:
Index: llvm/test/Transforms/PhaseOrdering/single-iteration-loop-sroa.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/single-iteration-loop-sroa.ll
+++ llvm/test/Transforms/PhaseOrdering/single-iteration-loop-sroa.ll
@@ -58,7 +58,7 @@
 define i16 @test(i16 %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: bb6.i.i.i:
-; CHECK-NEXT: [[DATA_I_SROA_0_0_INSERT_INSERT:%.*]] = call i16 @llvm.bswap.i16(i16 [[ARG:%.*]])
+; CHECK-NEXT: [[DATA_I_SROA_0_0_INSERT_INSERT:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[ARG:%.*]])
 ; CHECK-NEXT: ret i16 [[DATA_I_SROA_0_0_INSERT_INSERT]]
 ;
   %ret = call i16 @helper(i16 %arg, i64 1)