diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp
--- a/llvm/lib/Transforms/Utils/SizeOpts.cpp
+++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp
@@ -29,7 +29,7 @@
              "to cold code."));
 
 cl::opt<bool> PGSOIRPassOrTestOnly(
-    "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(true),
+    "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false),
     cl::desc("Apply the profile guided size optimizations only"
              "to the IR passes or tests."));
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
@@ -0,0 +1,130 @@
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-apple-darwin -o - | \
+; RUN: FileCheck --check-prefixes=CHECK,CHECK-DARWIN %s
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-linux-gnu -o - | \
+; RUN: FileCheck --check-prefixes=CHECK,CHECK-LINUX %s
+; Check that calls to memset() with a zero value become calls to bzero() on Darwin.
+
+; CHECK-LABEL: fct1:
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct1(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+; CHECK-LABEL: fct2:
+; When the size is bigger than 256, change into bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct2(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct3:
+; For unknown size, change to bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct3(i8* nocapture %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct4:
+; Size <= 256, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct4(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK-LABEL: fct5:
+; Size > 256, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct5(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct6:
+; Size = unknown, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct6(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+  ret void
+}
+
+; The next functions check that memset is not turned into bzero
+; when the set constant is non-zero, regardless of the given size.
+
+; CHECK-LABEL: fct7:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct7(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct8:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct8(i8* %ptr) !prof !14 { +entry: + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp) + ret void +} + +; CHECK-LABEL: fct9: +; memset with something that is not a zero, no change. +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset +define void @fct9(i8* %ptr, i32 %unknown) !prof !14 { +entry: + %conv = sext i32 %unknown to i64 + %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) + %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/AArch64/max-jump-table.ll b/llvm/test/CodeGen/AArch64/max-jump-table.ll --- a/llvm/test/CodeGen/AArch64/max-jump-table.ll +++ b/llvm/test/CodeGen/AArch64/max-jump-table.ll @@ -203,3 +203,136 @@ return: ret void } + +define i32 @jt1_optsize(i32 %a, i32 %b) optsize { +entry: + switch i32 %a, label %return [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 + i32 8, label %bb8 + i32 9, label %bb9 + i32 10, label %bb10 + i32 11, label %bb11 + i32 12, label %bb12 + i32 13, label %bb13 + i32 14, label %bb14 + i32 15, label %bb15 + i32 16, label %bb16 + i32 17, label %bb17 + ] +; CHECK-LABEL: function jt1_optsize: +; CHECK-NEXT: Jump Tables: +; CHECK0-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK0-NOT: %jump-table.1: +; CHECK4-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK4-NOT: %jump-table.1: +; CHECK8-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK8-NOT: %jump-table.1: +; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK16-NOT: %jump-table.1: +; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM1-NOT: %jump-table.1: +; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM3-NOT: %jump-table.1: +; CHECK-DAG: End machine code for function jt1_optsize. 
+ +bb1: tail call void @ext(i32 1, i32 0) br label %return +bb2: tail call void @ext(i32 2, i32 2) br label %return +bb3: tail call void @ext(i32 3, i32 4) br label %return +bb4: tail call void @ext(i32 4, i32 6) br label %return +bb5: tail call void @ext(i32 5, i32 8) br label %return +bb6: tail call void @ext(i32 6, i32 10) br label %return +bb7: tail call void @ext(i32 7, i32 12) br label %return +bb8: tail call void @ext(i32 8, i32 14) br label %return +bb9: tail call void @ext(i32 9, i32 16) br label %return +bb10: tail call void @ext(i32 1, i32 18) br label %return +bb11: tail call void @ext(i32 2, i32 20) br label %return +bb12: tail call void @ext(i32 3, i32 22) br label %return +bb13: tail call void @ext(i32 4, i32 24) br label %return +bb14: tail call void @ext(i32 5, i32 26) br label %return +bb15: tail call void @ext(i32 6, i32 28) br label %return +bb16: tail call void @ext(i32 7, i32 30) br label %return +bb17: tail call void @ext(i32 8, i32 32) br label %return + +return: ret i32 %b +} + +define i32 @jt1_pgso(i32 %a, i32 %b) !prof !14 { +entry: + switch i32 %a, label %return [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 + i32 8, label %bb8 + i32 9, label %bb9 + i32 10, label %bb10 + i32 11, label %bb11 + i32 12, label %bb12 + i32 13, label %bb13 + i32 14, label %bb14 + i32 15, label %bb15 + i32 16, label %bb16 + i32 17, label %bb17 + ] +; CHECK-LABEL: function jt1_pgso: +; CHECK-NEXT: Jump Tables: +; CHECK0-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK0-NOT: %jump-table.1: +; CHECK4-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK4-NOT: %jump-table.1: +; CHECK8-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK8-NOT: %jump-table.1: +; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECK16-NOT: %jump-table.1: +; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM1-NOT: %jump-table.1: +; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17 +; CHECKM3-NOT: %jump-table.1: +; CHECK-DAG: End machine code for function jt1_pgso. 
+ +bb1: tail call void @ext(i32 1, i32 0) br label %return +bb2: tail call void @ext(i32 2, i32 2) br label %return +bb3: tail call void @ext(i32 3, i32 4) br label %return +bb4: tail call void @ext(i32 4, i32 6) br label %return +bb5: tail call void @ext(i32 5, i32 8) br label %return +bb6: tail call void @ext(i32 6, i32 10) br label %return +bb7: tail call void @ext(i32 7, i32 12) br label %return +bb8: tail call void @ext(i32 8, i32 14) br label %return +bb9: tail call void @ext(i32 9, i32 16) br label %return +bb10: tail call void @ext(i32 1, i32 18) br label %return +bb11: tail call void @ext(i32 2, i32 20) br label %return +bb12: tail call void @ext(i32 3, i32 22) br label %return +bb13: tail call void @ext(i32 4, i32 24) br label %return +bb14: tail call void @ext(i32 5, i32 26) br label %return +bb15: tail call void @ext(i32 6, i32 28) br label %return +bb16: tail call void @ext(i32 7, i32 30) br label %return +bb17: tail call void @ext(i32 8, i32 32) br label %return + +return: ret i32 %b +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/ARM/constantpool-align.ll b/llvm/test/CodeGen/ARM/constantpool-align.ll --- a/llvm/test/CodeGen/ARM/constantpool-align.ll +++ b/llvm/test/CodeGen/ARM/constantpool-align.ll @@ -17,3 +17,28 @@ store <4 x i32> , <4 x i32>* %p, align 4 ret void } + +; CHECK-LABEL: f_pgso: +; CHECK: vld1.64 {{.*}}, [r1] +; CHECK: .p2align 3 +define void @f_pgso(<4 x i32>* %p) !prof !14 { + store <4 x i32> , <4 x i32>* %p, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -23,6 +23,17 @@ ret void } +; Perform tail call optimization for external symbol. +@dest_pgso = global [2 x i8] zeroinitializer +define void @caller_extern_pgso(i8* %src) !prof !14 { +entry: +; CHECK: caller_extern_pgso +; CHECK-NOT: call memcpy +; CHECK: tail memcpy + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @dest_pgso, i32 0, i32 0), i8* %src, i32 7, i1 false) + ret void +} + ; Perform indirect tail call optimization (for function pointer call). 
declare void @callee_indirect1() declare void @callee_indirect2() @@ -146,3 +157,20 @@ tail call void @callee_nostruct() ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll --- a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll +++ b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll @@ -29,6 +29,13 @@ ret i32 %a } +define i32 @test_pgso(i32 %a) nounwind !prof !14 { +; CHECK: test_pgso +; CHECK: movl +; CHECK-NEXT: ret + ret i32 %a +} + define i32 @test_add(i32 %a, i32 %b) nounwind { ; CHECK: test_add ; CHECK: addl @@ -101,3 +108,19 @@ ret void } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll --- a/llvm/test/CodeGen/X86/avx-cvt.ll +++ b/llvm/test/CodeGen/X86/avx-cvt.ll @@ -190,6 +190,16 @@ ret float %res } +define float @floor_f32_load_pgso(float* %aptr) !prof !14 { +; CHECK-LABEL: floor_f32_load_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = load float, float* %aptr + %res = call float @llvm.floor.f32(float %a) + ret float %res +} + define double @nearbyint_f64_load(double* %aptr) optsize { ; CHECK-LABEL: nearbyint_f64_load: ; CHECK: # %bb.0: @@ -200,3 +210,29 @@ ret double %res } +define double @nearbyint_f64_load_pgso(double* %aptr) !prof !14 { +; CHECK-LABEL: nearbyint_f64_load_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = load double, double* %aptr + %res = call double @llvm.nearbyint.f64(double %a) + ret double %res +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1970,6 +1970,47 @@ ret <32 x i16> %ret } +define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 { +; KNL-LABEL: 
test_build_vec_v32i1_pgso: +; KNL: ## %bb.0: +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_build_vec_v32i1_pgso: +; SKX: ## %bb.0: +; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_build_vec_v32i1_pgso: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_build_vec_v32i1_pgso: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_build_vec_v32i1_pgso: +; X86: ## %bb.0: +; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl + %ret = select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer + ret <32 x i16> %ret +} + define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## %bb.0: @@ -2013,12 +2054,12 @@ ; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB43_2 +; KNL-NEXT: je LBB44_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovapd %zmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB43_2: ## %L2 +; KNL-NEXT: LBB44_2: ## %L2 ; KNL-NEXT: vmovapd %zmm0, 8(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -2029,12 +2070,12 @@ ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; SKX-NEXT: ktestb %k0, %k1 -; SKX-NEXT: je LBB43_2 +; SKX-NEXT: je LBB44_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB43_2: ## %L2 +; SKX-NEXT: LBB44_2: ## %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -2046,12 +2087,12 @@ ; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB43_2 +; AVX512BW-NEXT: je LBB44_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovapd %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB43_2: ## %L2 +; AVX512BW-NEXT: LBB44_2: ## %L2 ; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2062,12 +2103,12 @@ ; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: ktestb %k0, %k1 -; AVX512DQ-NEXT: je LBB43_2 +; AVX512DQ-NEXT: je LBB44_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB43_2: ## %L2 +; AVX512DQ-NEXT: LBB44_2: ## %L2 ; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -2079,12 +2120,12 @@ ; X86-NEXT: vmovupd 8(%eax), %zmm1 {%k1} {z} ; X86-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; X86-NEXT: ktestb %k0, %k1 -; X86-NEXT: je LBB43_2 +; X86-NEXT: je LBB44_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovapd %zmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB43_2: ## %L2 +; X86-NEXT: LBB44_2: 
## %L2 ; X86-NEXT: vmovapd %zmm0, 8(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -2131,13 +2172,13 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB44_2 +; KNL-NEXT: je LBB45_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovaps %zmm0, (%rdi) ; KNL-NEXT: vmovaps %zmm1, 64(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB44_2: ## %L2 +; KNL-NEXT: LBB45_2: ## %L2 ; KNL-NEXT: vmovaps %zmm0, 4(%rdi) ; KNL-NEXT: vmovaps %zmm1, 68(%rdi) ; KNL-NEXT: vzeroupper @@ -2154,13 +2195,13 @@ ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k1 ; SKX-NEXT: kortestd %k1, %k0 -; SKX-NEXT: je LBB44_2 +; SKX-NEXT: je LBB45_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB44_2: ## %L2 +; SKX-NEXT: LBB45_2: ## %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) ; SKX-NEXT: vzeroupper @@ -2177,13 +2218,13 @@ ; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1 ; AVX512BW-NEXT: kortestd %k1, %k0 -; AVX512BW-NEXT: je LBB44_2 +; AVX512BW-NEXT: je LBB45_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB44_2: ## %L2 +; AVX512BW-NEXT: LBB45_2: ## %L2 ; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512BW-NEXT: vzeroupper @@ -2203,13 +2244,13 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB44_2 +; AVX512DQ-NEXT: je LBB45_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB44_2: ## %L2 +; AVX512DQ-NEXT: LBB45_2: ## %L2 ; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512DQ-NEXT: vzeroupper @@ -2227,13 +2268,13 @@ ; X86-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; X86-NEXT: kunpckwd %k1, %k2, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB44_2 +; X86-NEXT: je LBB45_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovaps %zmm0, (%eax) ; X86-NEXT: vmovaps %zmm1, 64(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB44_2: ## %L2 +; X86-NEXT: LBB45_2: ## %L2 ; X86-NEXT: vmovaps %zmm0, 4(%eax) ; X86-NEXT: vmovaps %zmm1, 68(%eax) ; X86-NEXT: vzeroupper @@ -4188,12 +4229,12 @@ ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testw %ax, %ax -; KNL-NEXT: jle LBB65_1 +; KNL-NEXT: jle LBB66_1 ; KNL-NEXT: ## %bb.2: ## %bb.2 ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB65_1: ## %bb.1 +; KNL-NEXT: LBB66_1: ## %bb.1 ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4207,12 +4248,12 @@ ; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testw %ax, %ax -; SKX-NEXT: jle LBB65_1 +; SKX-NEXT: jle LBB66_1 ; SKX-NEXT: ## %bb.2: ## %bb.2 ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB65_1: ## %bb.1 +; SKX-NEXT: LBB66_1: ## %bb.1 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4226,12 +4267,12 @@ ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testw %ax, %ax -; AVX512BW-NEXT: jle LBB65_1 +; AVX512BW-NEXT: jle LBB66_1 ; AVX512BW-NEXT: ## %bb.2: ## %bb.2 ; AVX512BW-NEXT: 
popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB65_1: ## %bb.1 +; AVX512BW-NEXT: LBB66_1: ## %bb.1 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4245,12 +4286,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: testw %ax, %ax -; AVX512DQ-NEXT: jle LBB65_1 +; AVX512DQ-NEXT: jle LBB66_1 ; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB65_1: ## %bb.1 +; AVX512DQ-NEXT: LBB66_1: ## %bb.1 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4264,12 +4305,12 @@ ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: testw %ax, %ax -; X86-NEXT: jle LBB65_1 +; X86-NEXT: jle LBB66_1 ; X86-NEXT: ## %bb.2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB65_1: ## %bb.1 +; X86-NEXT: LBB66_1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4297,11 +4338,11 @@ ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kortestw %k0, %k0 -; CHECK-NEXT: jb LBB66_2 +; CHECK-NEXT: jb LBB67_2 ; CHECK-NEXT: ## %bb.1: ## %bb.1 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _foo -; CHECK-NEXT: LBB66_2: ## %bb.2 +; CHECK-NEXT: LBB67_2: ## %bb.2 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -4313,11 +4354,11 @@ ; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kortestw %k0, %k0 -; X86-NEXT: jb LBB66_2 +; X86-NEXT: jb LBB67_2 ; X86-NEXT: ## %bb.1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo -; X86-NEXT: LBB66_2: ## %bb.2 +; X86-NEXT: LBB67_2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -4505,12 +4546,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB72_1 +; KNL-NEXT: je LBB73_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB72_1: ## %bar +; KNL-NEXT: LBB73_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4527,12 +4568,12 @@ ; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB72_1 +; SKX-NEXT: je LBB73_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB72_1: ## %bar +; SKX-NEXT: LBB73_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4555,12 +4596,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB72_1 +; AVX512BW-NEXT: je LBB73_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB72_1: ## %bar +; AVX512BW-NEXT: LBB73_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4581,12 +4622,12 @@ ; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: korb %k3, %k2, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB72_1 +; AVX512DQ-NEXT: je LBB73_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB72_1: ## %bar +; AVX512DQ-NEXT: LBB73_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ 
-4603,12 +4644,12 @@ ; X86-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB72_1 +; X86-NEXT: je LBB73_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB72_1: ## %bar +; X86-NEXT: LBB73_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4646,12 +4687,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB73_1 +; KNL-NEXT: je LBB74_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB73_1: ## %bar +; KNL-NEXT: LBB74_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4668,12 +4709,12 @@ ; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB73_1 +; SKX-NEXT: je LBB74_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB73_1: ## %bar +; SKX-NEXT: LBB74_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4692,12 +4733,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB73_1 +; AVX512BW-NEXT: je LBB74_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB73_1: ## %bar +; AVX512BW-NEXT: LBB74_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4714,12 +4755,12 @@ ; AVX512DQ-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korb %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB73_1 +; AVX512DQ-NEXT: je LBB74_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB73_1: ## %bar +; AVX512DQ-NEXT: LBB74_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4736,12 +4777,12 @@ ; X86-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB73_1 +; X86-NEXT: je LBB74_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB73_1: ## %bar +; X86-NEXT: LBB74_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4778,12 +4819,12 @@ ; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: je LBB74_1 +; KNL-NEXT: je LBB75_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB74_1: ## %bar +; KNL-NEXT: LBB75_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4800,12 +4841,12 @@ ; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; SKX-NEXT: korw %k2, %k1, %k1 ; SKX-NEXT: ktestw %k1, %k0 -; SKX-NEXT: je LBB74_1 +; SKX-NEXT: je LBB75_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB74_1: ## %bar +; SKX-NEXT: LBB75_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4823,12 +4864,12 @@ ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kortestw %k0, %k0 -; AVX512BW-NEXT: je LBB74_1 +; AVX512BW-NEXT: je LBB75_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; 
AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB74_1: ## %bar +; AVX512BW-NEXT: LBB75_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4845,12 +4886,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestw %k1, %k0 -; AVX512DQ-NEXT: je LBB74_1 +; AVX512DQ-NEXT: je LBB75_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB74_1: ## %bar +; AVX512DQ-NEXT: LBB75_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4867,12 +4908,12 @@ ; X86-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; X86-NEXT: korw %k2, %k1, %k1 ; X86-NEXT: ktestw %k1, %k0 -; X86-NEXT: je LBB74_1 +; X86-NEXT: je LBB75_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB74_1: ## %bar +; X86-NEXT: LBB75_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4928,12 +4969,12 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB75_1 +; KNL-NEXT: je LBB76_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB75_1: ## %bar +; KNL-NEXT: LBB76_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4950,12 +4991,12 @@ ; SKX-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: ktestd %k1, %k0 -; SKX-NEXT: je LBB75_1 +; SKX-NEXT: je LBB76_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB75_1: ## %bar +; SKX-NEXT: LBB76_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4972,12 +5013,12 @@ ; AVX512BW-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: kord %k2, %k1, %k1 ; AVX512BW-NEXT: ktestd %k1, %k0 -; AVX512BW-NEXT: je LBB75_1 +; AVX512BW-NEXT: je LBB76_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB75_1: ## %bar +; AVX512BW-NEXT: LBB76_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -5014,12 +5055,12 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB75_1 +; AVX512DQ-NEXT: je LBB76_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB75_1: ## %bar +; AVX512DQ-NEXT: LBB76_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -5036,12 +5077,12 @@ ; X86-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; X86-NEXT: kord %k2, %k1, %k1 ; X86-NEXT: ktestd %k1, %k0 -; X86-NEXT: je LBB75_1 +; X86-NEXT: je LBB76_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB75_1: ## %bar +; X86-NEXT: LBB76_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -5121,12 +5162,12 @@ ; KNL-NEXT: orl %eax, %edx ; KNL-NEXT: shlq $32, %rdx ; KNL-NEXT: orq %rcx, %rdx -; KNL-NEXT: je LBB76_1 +; KNL-NEXT: je LBB77_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB76_1: ## %bar +; KNL-NEXT: LBB77_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -5143,12 +5184,12 @@ ; SKX-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; 
SKX-NEXT: korq %k2, %k1, %k1 ; SKX-NEXT: ktestq %k1, %k0 -; SKX-NEXT: je LBB76_1 +; SKX-NEXT: je LBB77_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB76_1: ## %bar +; SKX-NEXT: LBB77_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -5165,12 +5206,12 @@ ; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: korq %k2, %k1, %k1 ; AVX512BW-NEXT: ktestq %k1, %k0 -; AVX512BW-NEXT: je LBB76_1 +; AVX512BW-NEXT: je LBB77_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB76_1: ## %bar +; AVX512BW-NEXT: LBB77_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -5231,12 +5272,12 @@ ; AVX512DQ-NEXT: orl %eax, %edx ; AVX512DQ-NEXT: shlq $32, %rdx ; AVX512DQ-NEXT: orq %rcx, %rdx -; AVX512DQ-NEXT: je LBB76_1 +; AVX512DQ-NEXT: je LBB77_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB76_1: ## %bar +; AVX512DQ-NEXT: LBB77_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -5255,12 +5296,12 @@ ; X86-NEXT: kandq %k1, %k0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB76_1 +; X86-NEXT: je LBB77_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB76_1: ## %bar +; X86-NEXT: LBB77_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -5360,3 +5401,20 @@ %maskv = insertelement <64 x i1> , i1 %a_i, i32 0 ret <64 x i1> %maskv } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll --- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll @@ -130,6 +130,24 @@ ret i64 %div } +define i64 @div64_pgso(i64 %a, i64 %b) !prof !15 { +; CHECK-LABEL: div64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: retq +; +; HUGEWS-LABEL: div64_pgso: +; HUGEWS: # %bb.0: +; HUGEWS-NEXT: movq %rdi, %rax +; HUGEWS-NEXT: cqto +; HUGEWS-NEXT: idivq %rsi +; HUGEWS-NEXT: retq + %div = sdiv i64 %a, %b + ret i64 %div +} + define i64 @div64_hugews(i64 %a, i64 %b) { ; ATOM-LABEL: div64_hugews: ; ATOM: # %bb.0: @@ -137,12 +155,12 @@ ; ATOM-NEXT: movq %rdi, %rax ; ATOM-NEXT: orq %rsi, %rcx ; ATOM-NEXT: shrq $32, %rcx -; ATOM-NEXT: je .LBB3_1 +; ATOM-NEXT: je .LBB4_1 ; ATOM-NEXT: # %bb.2: ; ATOM-NEXT: cqto ; ATOM-NEXT: idivq %rsi ; ATOM-NEXT: retq -; ATOM-NEXT: .LBB3_1: +; ATOM-NEXT: .LBB4_1: ; ATOM-NEXT: # kill: def $eax killed $eax killed $rax ; ATOM-NEXT: xorl %edx, %edx ; ATOM-NEXT: divl %esi @@ -155,12 +173,12 @@ ; SLM-NEXT: movq %rdi, %rax ; SLM-NEXT: orq %rsi, %rcx ; SLM-NEXT: shrq $32, %rcx -; SLM-NEXT: je .LBB3_1 +; 
SLM-NEXT: je .LBB4_1 ; SLM-NEXT: # %bb.2: ; SLM-NEXT: cqto ; SLM-NEXT: idivq %rsi ; SLM-NEXT: retq -; SLM-NEXT: .LBB3_1: +; SLM-NEXT: .LBB4_1: ; SLM-NEXT: xorl %edx, %edx ; SLM-NEXT: # kill: def $eax killed $eax killed $rax ; SLM-NEXT: divl %esi @@ -173,12 +191,12 @@ ; SKL-NEXT: movq %rdi, %rcx ; SKL-NEXT: orq %rsi, %rcx ; SKL-NEXT: shrq $32, %rcx -; SKL-NEXT: je .LBB3_1 +; SKL-NEXT: je .LBB4_1 ; SKL-NEXT: # %bb.2: ; SKL-NEXT: cqto ; SKL-NEXT: idivq %rsi ; SKL-NEXT: retq -; SKL-NEXT: .LBB3_1: +; SKL-NEXT: .LBB4_1: ; SKL-NEXT: # kill: def $eax killed $eax killed $rax ; SKL-NEXT: xorl %edx, %edx ; SKL-NEXT: divl %esi @@ -213,6 +231,24 @@ ret i32 %div } +define i32 @div32_pgso(i32 %a, i32 %b) !prof !15 { +; CHECK-LABEL: div32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: retq +; +; HUGEWS-LABEL: div32_pgso: +; HUGEWS: # %bb.0: +; HUGEWS-NEXT: movl %edi, %eax +; HUGEWS-NEXT: cltd +; HUGEWS-NEXT: idivl %esi +; HUGEWS-NEXT: retq + %div = sdiv i32 %a, %b + ret i32 %div +} + define i32 @div32_minsize(i32 %a, i32 %b) minsize { ; CHECK-LABEL: div32_minsize: ; CHECK: # %bb.0: @@ -246,3 +282,4 @@ !12 = !{i32 10000, i64 1000, i32 1} !13 = !{i32 999000, i64 1000, i32 3} !14 = !{i32 999999, i64 5, i32 3} +!15 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll --- a/llvm/test/CodeGen/X86/cmov-into-branch.ll +++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll @@ -88,7 +88,7 @@ ; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 ret i32 %sel } @@ -104,7 +104,7 @@ ; CHECK-NEXT: .LBB6_2: # %select.end ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 ret i32 %sel } @@ -124,7 +124,7 @@ ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17 ret i32 %sel } @@ -137,12 +137,51 @@ ; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !18 ret i32 %sel } -!0 = !{!"branch_weights", i32 1, i32 99} -!1 = !{!"branch_weights", i32 1, i32 100} -!2 = !{!"branch_weights", i32 100, i32 1} -!3 = !{!"branch_weights", i32 0, i32 0} +define i32 @weighted_select_optsize(i32 %a, i32 %b) optsize { +; CHECK-LABEL: weighted_select_optsize: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} + +define i32 @weighted_select_pgso(i32 %a, i32 %b) !prof !14 { +; CHECK-LABEL: weighted_select_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} 
+!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} +!15 = !{!"branch_weights", i32 1, i32 99} +!16 = !{!"branch_weights", i32 1, i32 100} +!17 = !{!"branch_weights", i32 100, i32 1} +!18 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32 +; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64 +; RUN: llc < %s -mtriple=x86_64-win32 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=WIN64 + +declare void @foo() +declare void @bar() + +define void @f(i32 %x, i32 %y) !prof !14 { +; CHECK32-LABEL: f: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x08] +; CHECK32-NEXT: jne bar # TAILCALL +; CHECK32-NEXT: # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: f: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; CHECK64-NEXT: jne bar # TAILCALL +; CHECK64-NEXT: # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: f: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] +; WIN64-NEXT: jne bar # TAILCALL +; WIN64-NEXT: # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +entry: + %p = icmp eq i32 %x, %y + br i1 %p, label %bb1, label %bb2 +bb1: + tail call void @foo() + ret void +bb2: + tail call void @bar() + ret void + +; Check that the asm doesn't just look good, but uses the correct encoding. 
+} + +define void @f_non_leaf(i32 %x, i32 %y) !prof !14 { +; CHECK32-LABEL: f_non_leaf: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: pushl %ebx # encoding: [0x53] +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: .cfi_offset %ebx, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; CHECK32-NEXT: #APP +; CHECK32-NEXT: #NO_APP +; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c] +; CHECK32-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: popl %ebx # encoding: [0x5b] +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB1_2: # %bb2 +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: popl %ebx # encoding: [0x5b] +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: jmp bar # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: f_non_leaf: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: pushq %rbx # encoding: [0x53] +; CHECK64-NEXT: .cfi_def_cfa_offset 16 +; CHECK64-NEXT: .cfi_offset %rbx, -16 +; CHECK64-NEXT: #APP +; CHECK64-NEXT: #NO_APP +; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; CHECK64-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: popq %rbx # encoding: [0x5b] +; CHECK64-NEXT: .cfi_def_cfa_offset 8 +; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB1_2: # %bb2 +; CHECK64-NEXT: .cfi_def_cfa_offset 16 +; CHECK64-NEXT: popq %rbx # encoding: [0x5b] +; CHECK64-NEXT: .cfi_def_cfa_offset 8 +; CHECK64-NEXT: jmp bar # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: f_non_leaf: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: pushq %rbx # encoding: [0x53] +; WIN64-NEXT: .seh_pushreg %rbx +; WIN64-NEXT: .seh_endprologue +; WIN64-NEXT: #APP +; WIN64-NEXT: #NO_APP +; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] +; WIN64-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: popq %rbx # encoding: [0x5b] +; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB1_2: # %bb2 +; WIN64-NEXT: nop # encoding: [0x90] +; WIN64-NEXT: popq %rbx # encoding: [0x5b] +; WIN64-NEXT: jmp bar # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: .seh_handlerdata +; WIN64-NEXT: .text +; WIN64-NEXT: .seh_endproc +entry: + ; Force %ebx to be spilled on the stack, turning this into + ; not a "leaf" function for Win64. 
+ tail call void asm sideeffect "", "~{ebx}"() + + %p = icmp eq i32 %x, %y + br i1 %p, label %bb1, label %bb2 +bb1: + tail call void @foo() + ret void +bb2: + tail call void @bar() + ret void + +} + +declare x86_thiscallcc zeroext i1 @baz(i8*, i32) +define x86_thiscallcc zeroext i1 @BlockPlacementTest(i8* %this, i32 %x) !prof !14 { +; CHECK32-LABEL: BlockPlacementTest: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] +; CHECK32-NEXT: testb $42, %dl # encoding: [0xf6,0xc2,0x2a] +; CHECK32-NEXT: je .LBB2_3 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %land.rhs +; CHECK32-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; CHECK32-NEXT: testb $44, %dl # encoding: [0xf6,0xc2,0x2c] +; CHECK32-NEXT: je baz # TAILCALL +; CHECK32-NEXT: # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB2_2: # %land.end +; CHECK32-NEXT: # kill: def $al killed $al killed $eax +; CHECK32-NEXT: retl $4 # encoding: [0xc2,0x04,0x00] +; CHECK32-NEXT: .LBB2_3: +; CHECK32-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK32-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: BlockPlacementTest: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: testb $42, %sil # encoding: [0x40,0xf6,0xc6,0x2a] +; CHECK64-NEXT: je .LBB2_3 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %land.rhs +; CHECK64-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; CHECK64-NEXT: testb $44, %sil # encoding: [0x40,0xf6,0xc6,0x2c] +; CHECK64-NEXT: je baz # TAILCALL +; CHECK64-NEXT: # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB2_2: # %land.end +; CHECK64-NEXT: # kill: def $al killed $al killed $eax +; CHECK64-NEXT: retq # encoding: [0xc3] +; CHECK64-NEXT: .LBB2_3: +; CHECK64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK64-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: BlockPlacementTest: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: testb $42, %dl # encoding: [0xf6,0xc2,0x2a] +; WIN64-NEXT: je .LBB2_3 # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %land.rhs +; WIN64-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; WIN64-NEXT: testb $44, %dl # encoding: [0xf6,0xc2,0x2c] +; WIN64-NEXT: je baz # TAILCALL +; WIN64-NEXT: # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB2_2: # %land.end +; WIN64-NEXT: # kill: def $al killed $al killed $eax +; WIN64-NEXT: retq # encoding: [0xc3] +; WIN64-NEXT: .LBB2_3: +; WIN64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; WIN64-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +entry: + %and = and i32 %x, 42 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %land.end, label %land.rhs + +land.rhs: + %and6 = and i32 %x, 44 + %tobool7 = icmp eq i32 %and6, 0 + br i1 %tobool7, label %lor.rhs, label %land.end + +lor.rhs: + %call = tail call x86_thiscallcc zeroext i1 @baz(i8* %this, i32 %x) #2 + br label %land.end + +land.end: + %0 = phi i1 [ false, %entry ], [ true, %land.rhs ], [ %call, %lor.rhs ] + ret i1 %0 + +; Make sure machine 
block placement isn't confused by the conditional tail call, +; but sees that it can fall through to the next block. +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll --- a/llvm/test/CodeGen/X86/fixup-lea.ll +++ b/llvm/test/CodeGen/X86/fixup-lea.ll @@ -108,17 +108,96 @@ ret void } +define void @foo_pgso(i32 inreg %dns) !prof !14 { +; SLOW-LABEL: foo_pgso: +; SLOW: # %bb.0: # %entry +; SLOW-NEXT: xorl %ecx, %ecx +; SLOW-NEXT: decl %ecx +; SLOW-NEXT: .LBB4_1: # %for.body +; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 +; SLOW-NEXT: movzwl %cx, %edx +; SLOW-NEXT: decl %ecx +; SLOW-NEXT: cmpl %eax, %edx +; SLOW-NEXT: jl .LBB4_1 +; SLOW-NEXT: # %bb.2: # %for.end +; SLOW-NEXT: retl +; +; FAST-LABEL: foo_pgso: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xorl %ecx, %ecx +; FAST-NEXT: decl %ecx +; FAST-NEXT: .LBB4_1: # %for.body +; FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-NEXT: movzwl %cx, %edx +; FAST-NEXT: addl $-1, %ecx +; FAST-NEXT: cmpl %eax, %edx +; FAST-NEXT: jl .LBB4_1 +; FAST-NEXT: # %bb.2: # %for.end +; FAST-NEXT: retl +entry: + br label %for.body + +for.body: + %i.05 = phi i16 [ %dec, %for.body ], [ 0, %entry ] + %dec = add i16 %i.05, -1 + %conv = zext i16 %dec to i32 + %cmp = icmp slt i32 %conv, %dns + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +define void @bar_pgso(i32 inreg %dns) !prof !14 { +; SLOW-LABEL: bar_pgso: +; SLOW: # %bb.0: # %entry +; SLOW-NEXT: xorl %ecx, %ecx +; SLOW-NEXT: incl %ecx +; SLOW-NEXT: .LBB5_1: # %for.body +; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 +; SLOW-NEXT: movzwl %cx, %edx +; SLOW-NEXT: incl %ecx +; SLOW-NEXT: cmpl %eax, %edx +; SLOW-NEXT: jl .LBB5_1 +; SLOW-NEXT: # %bb.2: # %for.end +; SLOW-NEXT: retl +; +; FAST-LABEL: bar_pgso: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xorl %ecx, %ecx +; FAST-NEXT: incl %ecx +; FAST-NEXT: .LBB5_1: # %for.body +; FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-NEXT: movzwl %cx, %edx +; FAST-NEXT: addl $1, %ecx +; FAST-NEXT: cmpl %eax, %edx +; FAST-NEXT: jl .LBB5_1 +; FAST-NEXT: # %bb.2: # %for.end +; FAST-NEXT: retl +entry: + br label %for.body + +for.body: + %i.05 = phi i16 [ %inc, %for.body ], [ 0, %entry ] + %inc = add i16 %i.05, 1 + %conv = zext i16 %inc to i32 + %cmp = icmp slt i32 %conv, %dns + br i1 %cmp, label %for.body, label %for.end +for.end: + ret void +} + define void @foo_nosize(i32 inreg %dns) { ; SLOW-LABEL: foo_nosize: ; SLOW: # %bb.0: # %entry ; SLOW-NEXT: movw $-1, %cx ; SLOW-NEXT: .p2align 4, 0x90 -; SLOW-NEXT: .LBB4_1: # %for.body +; SLOW-NEXT: .LBB6_1: # %for.body ; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 ; SLOW-NEXT: movzwl %cx, %edx ; SLOW-NEXT: decl %ecx ; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB4_1 +; SLOW-NEXT: jl .LBB6_1 ; SLOW-NEXT: # %bb.2: # %for.end ; SLOW-NEXT: retl ; @@ -126,12 +205,12 @@ ; FAST: # %bb.0: # %entry ; FAST-NEXT: movw $-1, %cx ; FAST-NEXT: .p2align 4, 0x90 -; FAST-NEXT: .LBB4_1: # %for.body +; FAST-NEXT: .LBB6_1: # %for.body 
; FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-NEXT: movzwl %cx, %edx ; FAST-NEXT: addl $-1, %ecx ; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB4_1 +; FAST-NEXT: jl .LBB6_1 ; FAST-NEXT: # %bb.2: # %for.end ; FAST-NEXT: retl entry: @@ -153,12 +232,12 @@ ; SLOW: # %bb.0: # %entry ; SLOW-NEXT: movw $1, %cx ; SLOW-NEXT: .p2align 4, 0x90 -; SLOW-NEXT: .LBB5_1: # %for.body +; SLOW-NEXT: .LBB7_1: # %for.body ; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 ; SLOW-NEXT: movzwl %cx, %edx ; SLOW-NEXT: incl %ecx ; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB5_1 +; SLOW-NEXT: jl .LBB7_1 ; SLOW-NEXT: # %bb.2: # %for.end ; SLOW-NEXT: retl ; @@ -166,12 +245,12 @@ ; FAST: # %bb.0: # %entry ; FAST-NEXT: movw $1, %cx ; FAST-NEXT: .p2align 4, 0x90 -; FAST-NEXT: .LBB5_1: # %for.body +; FAST-NEXT: .LBB7_1: # %for.body ; FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-NEXT: movzwl %cx, %edx ; FAST-NEXT: addl $1, %ecx ; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB5_1 +; FAST-NEXT: jl .LBB7_1 ; FAST-NEXT: # %bb.2: # %for.end ; FAST-NEXT: retl entry: @@ -186,3 +265,20 @@ for.end: ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll --- a/llvm/test/CodeGen/X86/fold-load-unops.ll +++ b/llvm/test/CodeGen/X86/fold-load-unops.ll @@ -113,6 +113,38 @@ ret <4 x float> %res } +define float @rcpss_pgso(float* %a) !prof !14 { +; SSE-LABEL: rcpss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rcpss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rcpss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @rcpss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: rcpss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rcpss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rcpss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld) + ret <4 x float> %res +} + define float @rsqrtss_size(float* %a) optsize { ; SSE-LABEL: rsqrtss_size: ; SSE: # %bb.0: @@ -145,6 +177,38 @@ ret <4 x float> %res } +define float @rsqrtss_pgso(float* %a) !prof !14 { +; SSE-LABEL: rsqrtss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rsqrtss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @rsqrtss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: rsqrtss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss 
(%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rsqrtss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + define float @sqrtss_size(float* %a) optsize{ ; SSE-LABEL: sqrtss_size: ; SSE: # %bb.0: @@ -196,6 +260,57 @@ ret <4 x float> %res } +define float @sqrtss_pgso(float* %a) !prof !14 { +; SSE-LABEL: sqrtss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: sqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @sqrtss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: sqrtss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + +define <4 x float> @sqrtss_full_pgso_volatile(<4 x float>* %a) !prof !14 { +; SSE-LABEL: sqrtss_full_pgso_volatile: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_full_pgso_volatile: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + define double @sqrtsd_size(double* %a) optsize { ; SSE-LABEL: sqrtsd_size: ; SSE: # %bb.0: @@ -247,7 +362,75 @@ ret <2 x double> %res } +define double @sqrtsd_pgso(double* %a) !prof !14 { +; SSE-LABEL: sqrtsd_pgso: +; SSE: # %bb.0: +; SSE-NEXT: sqrtsd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load double, double* %a + %ins = insertelement <2 x double> undef, double %ld, i32 0 + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins) + %ext = extractelement <2 x double> %res, i32 0 + ret double %ext +} + +define <2 x double> @sqrtsd_full_pgso(<2 x double>* %a) !prof !14 { +; SSE-LABEL: sqrtsd_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x double>, <2 x double>* %a + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) + ret <2 x double> %res +} + +define <2 x double> @sqrtsd_full_pgso_volatile(<2 x double>* %a) !prof !14 { +; SSE-LABEL: sqrtsd_full_pgso_volatile: +; SSE: # %bb.0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_full_pgso_volatile: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <2 x double>, <2 x double>* %a + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) + ret <2 x 
double> %res +} + declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -196,6 +196,26 @@ ret i32 %tmp } +define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 { +; X86-LABEL: var_shift_i32_pgso: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: var_shift_i32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shldl %cl, %esi, %eax +; X64-NEXT: retq + %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) + ret i32 %tmp +} + define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: @@ -216,36 +236,36 @@ ; X86-FAST-NEXT: shll %cl, %edi ; X86-FAST-NEXT: shldl %cl, %eax, %ebp ; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB4_2 +; X86-FAST-NEXT: je .LBB5_2 ; X86-FAST-NEXT: # %bb.1: ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB4_2: +; X86-FAST-NEXT: .LBB5_2: ; X86-FAST-NEXT: movb $64, %cl ; X86-FAST-NEXT: subb %bl, %cl ; X86-FAST-NEXT: movl %edx, %esi ; X86-FAST-NEXT: shrl %cl, %esi ; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: jne .LBB4_3 +; X86-FAST-NEXT: jne .LBB5_3 ; X86-FAST-NEXT: # %bb.4: ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: jne .LBB4_6 -; X86-FAST-NEXT: jmp .LBB4_7 -; X86-FAST-NEXT: .LBB4_3: +; X86-FAST-NEXT: jne .LBB5_6 +; X86-FAST-NEXT: jmp .LBB5_7 +; X86-FAST-NEXT: .LBB5_3: ; X86-FAST-NEXT: movl %esi, %ecx ; X86-FAST-NEXT: xorl %esi, %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB4_7 -; X86-FAST-NEXT: .LBB4_6: +; X86-FAST-NEXT: je .LBB5_7 +; X86-FAST-NEXT: .LBB5_6: ; X86-FAST-NEXT: orl %esi, %ebp ; X86-FAST-NEXT: orl %ecx, %edi ; X86-FAST-NEXT: movl %edi, %eax ; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: .LBB4_7: +; X86-FAST-NEXT: .LBB5_7: ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi @@ -279,11 +299,11 @@ ; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: je .LBB4_2 +; X86-SLOW-NEXT: je .LBB5_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %eax, %ebp ; X86-SLOW-NEXT: movl %ebp, (%esp) # 
4-byte Spill -; X86-SLOW-NEXT: .LBB4_2: +; X86-SLOW-NEXT: .LBB5_2: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SLOW-NEXT: movl %ebp, %eax ; X86-SLOW-NEXT: movl %ebx, %ecx @@ -294,41 +314,41 @@ ; X86-SLOW-NEXT: negb %cl ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb %ch, %ch -; X86-SLOW-NEXT: je .LBB4_4 +; X86-SLOW-NEXT: je .LBB5_4 ; X86-SLOW-NEXT: # %bb.3: ; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: movl %eax, %ebp -; X86-SLOW-NEXT: .LBB4_4: +; X86-SLOW-NEXT: .LBB5_4: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB4_6 +; X86-SLOW-NEXT: je .LBB5_6 ; X86-SLOW-NEXT: # %bb.5: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: .LBB4_6: +; X86-SLOW-NEXT: .LBB5_6: ; X86-SLOW-NEXT: movb %dh, %cl ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: jne .LBB4_7 +; X86-SLOW-NEXT: jne .LBB5_7 ; X86-SLOW-NEXT: # %bb.8: ; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB4_10 -; X86-SLOW-NEXT: jmp .LBB4_11 -; X86-SLOW-NEXT: .LBB4_7: +; X86-SLOW-NEXT: jne .LBB5_10 +; X86-SLOW-NEXT: jmp .LBB5_11 +; X86-SLOW-NEXT: .LBB5_7: ; X86-SLOW-NEXT: movl %esi, %ecx ; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB4_11 -; X86-SLOW-NEXT: .LBB4_10: +; X86-SLOW-NEXT: je .LBB5_11 +; X86-SLOW-NEXT: .LBB5_10: ; X86-SLOW-NEXT: orl %esi, %ebp ; X86-SLOW-NEXT: orl %ecx, %edi ; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: .LBB4_11: +; X86-SLOW-NEXT: .LBB5_11: ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi @@ -503,3 +523,20 @@ %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 7) ret i64 %tmp } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -195,6 +195,26 @@ ret i32 %tmp } +define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 { +; X86-LABEL: var_shift_i32_pgso: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: var_shift_i32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrdl %cl, %edi, %eax +; X64-NEXT: retq + %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) + ret i32 %tmp +} + define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: @@ -216,30 +236,30 @@ ; X86-FAST-NEXT: shll %cl, %edi ; X86-FAST-NEXT: shldl %cl, %eax, %esi ; X86-FAST-NEXT: testb $32, %cl 
-; X86-FAST-NEXT: je .LBB4_2 +; X86-FAST-NEXT: je .LBB5_2 ; X86-FAST-NEXT: # %bb.1: ; X86-FAST-NEXT: movl %edi, %esi ; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB4_2: +; X86-FAST-NEXT: .LBB5_2: ; X86-FAST-NEXT: movl %edx, %ebp ; X86-FAST-NEXT: movl %ebx, %ecx ; X86-FAST-NEXT: shrl %cl, %ebp ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-FAST-NEXT: shrdl %cl, %edx, %eax ; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB4_4 +; X86-FAST-NEXT: je .LBB5_4 ; X86-FAST-NEXT: # %bb.3: ; X86-FAST-NEXT: movl %ebp, %eax ; X86-FAST-NEXT: xorl %ebp, %ebp -; X86-FAST-NEXT: .LBB4_4: +; X86-FAST-NEXT: .LBB5_4: ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB4_6 +; X86-FAST-NEXT: je .LBB5_6 ; X86-FAST-NEXT: # %bb.5: ; X86-FAST-NEXT: orl %ebp, %esi ; X86-FAST-NEXT: orl %eax, %edi ; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: .LBB4_6: +; X86-FAST-NEXT: .LBB5_6: ; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi @@ -274,11 +294,11 @@ ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb %ch, %ch ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB4_2 +; X86-SLOW-NEXT: je .LBB5_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %edi, %edx ; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB4_2: +; X86-SLOW-NEXT: .LBB5_2: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edx @@ -290,41 +310,41 @@ ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: testb %ah, %ah ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB4_4 +; X86-SLOW-NEXT: je .LBB5_4 ; X86-SLOW-NEXT: # %bb.3: ; X86-SLOW-NEXT: orl %edx, %edi ; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: .LBB4_4: +; X86-SLOW-NEXT: .LBB5_4: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB4_6 +; X86-SLOW-NEXT: je .LBB5_6 ; X86-SLOW-NEXT: # %bb.5: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: .LBB4_6: +; X86-SLOW-NEXT: .LBB5_6: ; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: jne .LBB4_7 +; X86-SLOW-NEXT: jne .LBB5_7 ; X86-SLOW-NEXT: # %bb.8: ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB4_10 -; X86-SLOW-NEXT: jmp .LBB4_11 -; X86-SLOW-NEXT: .LBB4_7: +; X86-SLOW-NEXT: jne .LBB5_10 +; X86-SLOW-NEXT: jmp .LBB5_11 +; X86-SLOW-NEXT: .LBB5_7: ; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB4_11 -; X86-SLOW-NEXT: .LBB4_10: +; X86-SLOW-NEXT: je .LBB5_11 +; X86-SLOW-NEXT: .LBB5_10: ; X86-SLOW-NEXT: orl %ebp, %esi ; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: .LBB4_11: +; X86-SLOW-NEXT: .LBB5_11: ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi @@ -498,3 +518,20 @@ %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 7) ret i64 %tmp } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} 
+!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1983,6 +1983,80 @@ ret float %x230 } +define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_4_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_4_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> + %x227 = fadd <4 x float> %x225, %x226 + %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> + %x229 = fadd <4 x float> %x227, %x228 + %x230 = extractelement <4 x float> %x229, i32 0 + ret float %x230 +} + +define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_8_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> + %x227 = fadd <8 x float> %x225, %x226 + %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> + %x229 = fadd <8 x float> %x227, %x228 + %x230 = extractelement <8 x float> %x229, i32 0 + ret float %x230 +} + +define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_16_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> + %x227 = fadd <16 x float> %x225, %x226 + %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> + %x229 = fadd <16 x float> %x227, %x228 + %x230 = extractelement <16 x float> %x229, i32 0 + ret float %x230 +} + define float @partial_reduction_fadd_v8f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32: ; SSE3-SLOW: # %bb.0: @@ -2115,3 +2189,20 @@ %r = extractelement <16 x float> %x0123, i32 0 ret float %r } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = 
!{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll --- a/llvm/test/CodeGen/X86/immediate_merging.ll +++ b/llvm/test/CodeGen/X86/immediate_merging.ll @@ -73,6 +73,68 @@ ret i32 0 } +; Test PGSO to make sure immediates with multiple users don't get pulled in to +; instructions. +define i32 @foo_pgso() !prof !14 { +; X86-LABEL: foo_pgso: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $1234, %eax # imm = 0x4D2 +; X86-NEXT: movl %eax, a +; X86-NEXT: movl %eax, b +; X86-NEXT: movl $12, %eax +; X86-NEXT: movl %eax, c +; X86-NEXT: cmpl %eax, e +; X86-NEXT: jne .LBB1_2 +; X86-NEXT: # %bb.1: # %if.then +; X86-NEXT: movl $1, x +; X86-NEXT: .LBB1_2: # %if.end +; X86-NEXT: movl $1234, f # imm = 0x4D2 +; X86-NEXT: movl $555, %eax # imm = 0x22B +; X86-NEXT: movl %eax, h +; X86-NEXT: addl %eax, i +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; +; X64-LABEL: foo_pgso: +; X64: # %bb.0: # %entry +; X64-NEXT: movl $1234, %eax # imm = 0x4D2 +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: movl $12, %eax +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: cmpl %eax, {{.*}}(%rip) +; X64-NEXT: jne .LBB1_2 +; X64-NEXT: # %bb.1: # %if.then +; X64-NEXT: movl $1, {{.*}}(%rip) +; X64-NEXT: .LBB1_2: # %if.end +; X64-NEXT: movl $1234, {{.*}}(%rip) # imm = 0x4D2 +; X64-NEXT: movl $555, %eax # imm = 0x22B +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: addl %eax, {{.*}}(%rip) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +entry: + store i32 1234, i32* @a + store i32 1234, i32* @b + store i32 12, i32* @c + %0 = load i32, i32* @e + %cmp = icmp eq i32 %0, 12 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* @x + br label %if.end + +; New block.. Make sure 1234 isn't live across basic blocks from before. +if.end: ; preds = %if.then, %entry + store i32 1234, i32* @f + store i32 555, i32* @h + %1 = load i32, i32* @i + %add1 = add nsw i32 %1, 555 + store i32 %add1, i32* @i + ret i32 0 +} + ; Test -O2 to make sure that all immediates get pulled in to their users. define i32 @foo2() { ; X86-LABEL: foo2: @@ -124,3 +186,47 @@ call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false) ret void } + +; memset gets lowered in DAG. Constant merging should hoist all the +; immediates used to store to the individual memory locations. Make +; sure we don't directly store the immediates. 
+define void @foomemset_pgso() !prof !14 { +; X86-LABEL: foomemset_pgso: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $555819297, %eax # imm = 0x21212121 +; X86-NEXT: movl %eax, AA+20 +; X86-NEXT: movl %eax, AA+16 +; X86-NEXT: movl %eax, AA+12 +; X86-NEXT: movl %eax, AA+8 +; X86-NEXT: movl %eax, AA+4 +; X86-NEXT: movl %eax, AA +; X86-NEXT: retl +; +; X64-LABEL: foomemset_pgso: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121 +; X64-NEXT: movq %rax, AA+{{.*}}(%rip) +; X64-NEXT: movq %rax, AA+{{.*}}(%rip) +; X64-NEXT: movq %rax, {{.*}}(%rip) +; X64-NEXT: retq +entry: + call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/immediate_merging64.ll b/llvm/test/CodeGen/X86/immediate_merging64.ll --- a/llvm/test/CodeGen/X86/immediate_merging64.ll +++ b/llvm/test/CodeGen/X86/immediate_merging64.ll @@ -19,6 +19,19 @@ ret i1 %cmp } +define i1 @imm_multiple_users_pgso(i64 %a, i64* %b) !prof !14 { +; CHECK-LABEL: imm_multiple_users_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + store i64 -1, i64* %b, align 8 + %cmp = icmp eq i64 %a, -1 + ret i1 %cmp +} + declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) ; Inlined memsets requiring multiple same-sized stores should be lowered using @@ -34,3 +47,31 @@ tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false) ret void } + +define void @memset_zero_pgso(i8* noalias nocapture %D) !prof !14 { +; CHECK-LABEL: memset_zero_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movq %rax, 7(%rdi) +; CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: retq + tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/loop-blocks.ll b/llvm/test/CodeGen/X86/loop-blocks.ll --- a/llvm/test/CodeGen/X86/loop-blocks.ll +++ b/llvm/test/CodeGen/X86/loop-blocks.ll @@ -269,6 +269,35 @@ attributes #0 = { minsize norecurse nounwind optsize readnone uwtable } +; CHECK-LABEL: slightly_more_involved_2_pgso: +; CHECK-NOT: jmp .LBB6_1 +; CHECK: .LBB6_1: +; CHECK-NEXT: callq body + +define void @slightly_more_involved_2_pgso() norecurse nounwind readnone uwtable !prof !14 { +entry: + br label %loop + +loop: + call void 
@body() + %t0 = call i32 @get() + %t1 = icmp slt i32 %t0, 2 + br i1 %t1, label %block_a, label %bb + +bb: + %t2 = call i32 @get() + %t3 = icmp slt i32 %t2, 99 + br i1 %t3, label %exit, label %loop + +block_a: + call void @bar99() + br label %loop + +exit: + call void @exit() + ret void +} + declare void @bar99() nounwind declare void @bar100() nounwind declare void @bar101() nounwind @@ -281,3 +310,20 @@ declare void @block_a_true_func() nounwind declare void @block_a_false_func() nounwind declare void @block_a_merge_func() nounwind + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll --- a/llvm/test/CodeGen/X86/materialize.ll +++ b/llvm/test/CodeGen/X86/materialize.ll @@ -30,6 +30,21 @@ ; CHECK64-NEXT: retq } +define i32 @one32_pgso() !prof !14 { +entry: + ret i32 1 + +; CHECK32-LABEL: one32_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl + +; FIXME: Figure out the best approach in 64-bit mode. +; CHECK64-LABEL: one32_pgso: +; CHECK64: movl $1, %eax +; CHECK64-NEXT: retq +} + define i32 @one32_minsize() minsize { entry: ret i32 1 @@ -107,6 +122,16 @@ ; CHECK32-NEXT: retl } +define i32 @minus_one32_pgso() !prof !14 { +entry: + ret i32 -1 + +; CHECK32-LABEL: minus_one32_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + define i32 @minus_one32_minsize() minsize { entry: ret i32 -1 @@ -140,6 +165,28 @@ ; CHECK32-NEXT: retl } +define i16 @one16_pgso() !prof !14 { +entry: + ret i16 1 + +; CHECK32-LABEL: one16_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: # kill +; CHECK32-NEXT: retl +} + +define i16 @minus_one16_pgso() !prof !14 { +entry: + ret i16 -1 + +; CHECK32-LABEL: minus_one16_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: # kill +; CHECK32-NEXT: retl +} + define i32 @minus_five32() minsize { entry: ret i32 -5 @@ -213,4 +260,72 @@ ; CHECK32: retl } +define i32 @rematerialize_minus_one_pgso() !prof !14 { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; -1 should be re-materialized here instead of getting spilled above. + ret i32 -1 + +; CHECK32-LABEL: rematerialize_minus_one_pgso +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NOT: %eax +; CHECK32: retl +} + +define i32 @rematerialize_minus_one_eflags_pgso(i32 %x) !prof !14 { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. 
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; Define eflags. + %a = icmp ne i32 %x, 123 + %b = zext i1 %a to i32 + ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. + ; It must therefore not use the xor-dec lowering. + %c = select i1 %a, i32 %b, i32 -1 + ret i32 %c + +; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: cmpl +; CHECK32: setne +; CHECK32-NOT: xorl +; CHECK32: movl $-1 +; CHECK32: cmov +; CHECK32: retl +} + declare x86_thiscallcc void @f(i32) + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -0,0 +1,1064 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: 
retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3: +; X86: # %bb.0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # %bb.0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # %bb.2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: xorw (%rsi), %ax +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorb 2(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orw %ax, %cx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define 
i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbl $0, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # %bb.0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5: +; X86: # %bb.0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # %bb.2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # %bb.0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # %bb.2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, 
%eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: xorl (%rsi), %eax +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorb 4(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: seta %al +; X64-NEXT: sbbl $0, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # %bb.0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl 8(%rsi), %ecx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: 
setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: 
setne %al +; X64-AVX-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length24: +; X64: # %bb.0: +; X64-NEXT: movl $24, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length24_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pmovmskb %xmm2, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al 
+; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X64-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length24_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # %bb.0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), 
%xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length32_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length32_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl 
$0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # %bb.0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length64_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length64_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq_const: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind 
!prof !14 { +; X86-LABEL: bcmp_length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: bcmp_length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -139,6 +139,36 @@ ret void } +define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 { +; LINUX-LABEL: test3_pgso: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: movl $64, %edx +; LINUX-NEXT: jmp memcpy # TAILCALL +; +; DARWIN-LABEL: test3_pgso: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: movq 56(%rsi), %rax +; DARWIN-NEXT: movq %rax, 56(%rdi) +; DARWIN-NEXT: movq 48(%rsi), %rax +; DARWIN-NEXT: movq %rax, 48(%rdi) +; DARWIN-NEXT: movq 40(%rsi), %rax +; DARWIN-NEXT: movq %rax, 40(%rdi) +; DARWIN-NEXT: movq 32(%rsi), %rax +; DARWIN-NEXT: movq %rax, 32(%rdi) +; DARWIN-NEXT: movq 24(%rsi), %rax +; DARWIN-NEXT: movq %rax, 24(%rdi) +; DARWIN-NEXT: movq 16(%rsi), %rax +; DARWIN-NEXT: movq %rax, 16(%rdi) +; DARWIN-NEXT: movq (%rsi), %rax +; DARWIN-NEXT: movq 8(%rsi), %rcx +; DARWIN-NEXT: movq %rcx, 8(%rdi) +; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: retq +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) + ret void +} + define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone { ; DARWIN-LABEL: test3_minsize: ; DARWIN: ## %bb.0: @@ -506,3 +536,20 @@ tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false) ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll --- a/llvm/test/CodeGen/X86/powi.ll +++ b/llvm/test/CodeGen/X86/powi.ll @@ -86,6 +86,39 @@ ret double %ret } +define double @pow_wrapper_pgso(double %a) !prof !14 { +; X86-X87-LABEL: pow_wrapper_pgso: 
+; X86-X87: # %bb.0: +; X86-X87-NEXT: subl $12, %esp +; X86-X87-NEXT: .cfi_def_cfa_offset 16 +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstpl (%esp) +; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-X87-NEXT: calll __powidf2 +; X86-X87-NEXT: addl $12, %esp +; X86-X87-NEXT: .cfi_def_cfa_offset 4 +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper_pgso: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powidf2 +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper_pgso: +; X64: # %bb.0: +; X64-NEXT: movl $15, %edi +; X64-NEXT: jmp __powidf2 # TAILCALL + %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] + ret double %ret +} + define double @pow_wrapper_minsize(double %a) minsize { ; X86-X87-LABEL: pow_wrapper_minsize: ; X86-X87: # %bb.0: @@ -124,3 +157,19 @@ declare double @llvm.powi.f64(double, i32) nounwind readonly +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll --- a/llvm/test/CodeGen/X86/rounding-ops.ll +++ b/llvm/test/CodeGen/X86/rounding-ops.ll @@ -252,3 +252,60 @@ %call = tail call double @trunc(double %x) nounwind readnone ret double %call } + +define float @test11_pgso(float* %xptr) nounwind !prof !14 { +; CHECK-SSE-LABEL: test11_pgso: +; CHECK-SSE: ## %bb.0: +; CHECK-SSE-NEXT: roundss $11, (%rdi), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test11_pgso: +; CHECK-AVX: ## %bb.0: +; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512-LABEL: test11_pgso: +; CHECK-AVX512: ## %bb.0: +; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: retq + %x = load float, float* %xptr + %call = tail call float @truncf(float %x) nounwind readnone + ret float %call +} + +define double @test12_pgso(double* %xptr) nounwind !prof !14 { +; CHECK-SSE-LABEL: test12_pgso: +; CHECK-SSE: ## %bb.0: +; CHECK-SSE-NEXT: roundsd $11, (%rdi), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test12_pgso: +; CHECK-AVX: ## %bb.0: +; CHECK-AVX-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512-LABEL: test12_pgso: +; CHECK-AVX512: ## %bb.0: +; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: retq + %x = load double, double* %xptr + %call = tail call double @trunc(double %x) nounwind readnone + ret double %call +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = 
!{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s + +declare void @bar() + +define void @test1(i32* nocapture %X) nounwind !prof !14 { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, (%rdi) +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %tmp1 = load i32, i32* %X, align 4 + %and = and i32 %tmp1, 255 + %cmp = icmp eq i32 %and, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test3(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; PR16083 +define i1 @test4(i64 %a, i32 %b) { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: je .LBB3_1 +; CHECK-NEXT: # %bb.2: # %lor.end +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB3_1: # %lor.rhs +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %tobool = icmp ne i32 %b, 0 + br i1 %tobool, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %entry + %and = and i64 0, %a + %tobool1 = icmp ne i64 %and, 0 + br label %lor.end + +lor.end: ; preds = %lor.rhs, %entry + %p = phi i1 [ true, %entry ], [ %tobool1, %lor.rhs ] + ret i1 %p +} + +@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4 + +; PR16551 +define void @test5(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl x+{{.*}}(%rip), %eax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movzwl x+{{.*}}(%rip), %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: cmpl $1, %ecx +; CHECK-NEXT: jne bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %bf.load = load i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4 + %bf.lshr = lshr i56 %bf.load, 32 + %bf.cast = trunc i56 %bf.lshr to i32 + %cmp = icmp ne i32 %bf.cast, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2_1(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: cmpl 
$256, %eax # imm = 0x100 +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 256 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_1(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_47(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_47: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_127(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $127, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg1(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg2(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-2, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -2 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg127(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-127, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg128(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-128, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -128 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_255(i8 %x) nounwind !prof !14 { +; 
CHECK-LABEL: test_sext_i8_icmp_255: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/slow-incdec.ll b/llvm/test/CodeGen/X86/slow-incdec.ll --- a/llvm/test/CodeGen/X86/slow-incdec.ll +++ b/llvm/test/CodeGen/X86/slow-incdec.ll @@ -54,6 +54,26 @@ ret i32 %r } +define i32 @inc_pgso(i32 %x) !prof !14 { +; CHECK-LABEL: inc_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: incl %eax +; CHECK-NEXT: retl + %r = add i32 %x, 1 + ret i32 %r +} + +define i32 @dec_pgso(i32 %x) !prof !14 { +; CHECK-LABEL: dec_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: decl %eax +; CHECK-NEXT: retl + %r = add i32 %x, -1 + ret i32 %r +} + declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) declare void @other(i32* ) nounwind; @@ -62,20 +82,20 @@ ; INCDEC: # %bb.0: # %entry ; INCDEC-NEXT: movl {{[0-9]+}}(%esp), %eax ; INCDEC-NEXT: incl (%eax) -; INCDEC-NEXT: jne .LBB4_1 +; INCDEC-NEXT: jne .LBB6_1 ; INCDEC-NEXT: # %bb.2: # %if.end4 ; INCDEC-NEXT: jmp other # TAILCALL -; INCDEC-NEXT: .LBB4_1: # %return +; INCDEC-NEXT: .LBB6_1: # %return ; INCDEC-NEXT: retl ; ; ADD-LABEL: cond_ae_to_cond_ne: ; ADD: # %bb.0: # %entry ; ADD-NEXT: movl {{[0-9]+}}(%esp), %eax ; ADD-NEXT: addl $1, (%eax) -; ADD-NEXT: jne .LBB4_1 +; ADD-NEXT: jne .LBB6_1 ; ADD-NEXT: # %bb.2: # %if.end4 ; ADD-NEXT: jmp other # TAILCALL -; ADD-NEXT: .LBB4_1: # %return +; ADD-NEXT: .LBB6_1: # %return ; ADD-NEXT: retl entry: %t0 = load i32, i32* %p, align 8 @@ -109,10 +129,10 @@ ; INCDEC-NEXT: incb a ; INCDEC-NEXT: sete d ; INCDEC-NEXT: testb %al, %al -; INCDEC-NEXT: jne .LBB5_2 +; INCDEC-NEXT: jne .LBB7_2 ; INCDEC-NEXT: # %bb.1: # %then ; INCDEC-NEXT: jmp external_a # TAILCALL -; INCDEC-NEXT: .LBB5_2: # %else +; INCDEC-NEXT: .LBB7_2: # %else ; INCDEC-NEXT: jmp external_b # TAILCALL ; ; ADD-LABEL: test_tail_call: @@ -123,10 +143,10 @@ ; ADD-NEXT: addb $1, a ; ADD-NEXT: sete d ; ADD-NEXT: testb %al, %al -; ADD-NEXT: jne .LBB5_2 +; ADD-NEXT: jne .LBB7_2 ; ADD-NEXT: # %bb.1: # %then ; ADD-NEXT: jmp external_a # TAILCALL -; ADD-NEXT: .LBB5_2: # %else +; ADD-NEXT: .LBB7_2: # %else ; ADD-NEXT: jmp external_b # TAILCALL entry: %val = load i32, i32* %ptr @@ -152,3 +172,19 @@ ret void } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 
100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -17,6 +17,17 @@ ret <2 x double> %add } +define <2 x double> @splat_v2f64_pgso(<2 x double> %x) !prof !14 { +; CHECK-LABEL: splat_v2f64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; CHECK-NEXT: # xmm1 = mem[0,0] +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %add = fadd <2 x double> %x, <double 1.0, double 1.0> + ret <2 x double> %add +} + define <4 x double> @splat_v4f64(<4 x double> %x) #1 { ; CHECK-LABEL: splat_v4f64: ; CHECK: # %bb.0: @@ -27,6 +38,16 @@ ret <4 x double> %add } +define <4 x double> @splat_v4f64_pgso(<4 x double> %x) !prof !14 { +; CHECK-LABEL: splat_v4f64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0> + ret <4 x double> %add +} + define <4 x float> @splat_v4f32(<4 x float> %x) #0 { ; CHECK-LABEL: splat_v4f32: ; CHECK: # %bb.0: @@ -37,6 +58,16 @@ ret <4 x float> %add } +define <4 x float> @splat_v4f32_pgso(<4 x float> %x) !prof !14 { +; CHECK-LABEL: splat_v4f32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0> + ret <4 x float> %add +} + define <8 x float> @splat_v8f32(<8 x float> %x) #1 { ; CHECK-LABEL: splat_v8f32: ; CHECK: # %bb.0: @@ -47,6 +78,16 @@ ret <8 x float> %add } +define <8 x float> @splat_v8f32_pgso(<8 x float> %x) !prof !14 { +; CHECK-LABEL: splat_v8f32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> + ret <8 x float> %add +} + ; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value. ; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq. define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 { @@ -66,6 +107,23 @@ ret <2 x i64> %add } +define <2 x i64> @splat_v2i64_pgso(<2 x i64> %x) !prof !14 { +; AVX-LABEL: splat_v2i64_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] +; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v2i64_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <2 x i64> %x, <i64 2, i64 2> + ret <2 x i64> %add +} + ; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors, ; and then we fake it: use vmovddup to splat 64-bit value.
define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { @@ -88,6 +146,26 @@ ret <4 x i64> %add } +define <4 x i64> @splat_v4i64_pgso(<4 x i64> %x) !prof !14 { +; AVX-LABEL: splat_v4i64_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2,2] +; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v4i64_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2> + ret <4 x i64> %add +} + ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 { ; AVX-LABEL: splat_v4i32: @@ -105,6 +183,22 @@ ret <4 x i32> %add } +define <4 x i32> @splat_v4i32_pgso(<4 x i32> %x) !prof !14 { +; AVX-LABEL: splat_v4i32_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v4i32_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2> + ret <4 x i32> %add +} + ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { ; AVX-LABEL: splat_v8i32: @@ -125,6 +219,25 @@ ret <8 x i32> %add } +define <8 x i32> @splat_v8i32_pgso(<8 x i32> %x) !prof !14 { +; AVX-LABEL: splat_v8i32_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] +; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v8i32_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i32> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 { ; AVX-LABEL: splat_v8i16: @@ -141,6 +254,21 @@ ret <8 x i16> %add } +define <8 x i16> @splat_v8i16_pgso(<8 x i16> %x) !prof !14 { +; AVX-LABEL: splat_v8i16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v8i16_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> + ret <8 x i16> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { ; AVX-LABEL: splat_v16i16: @@ -161,6 +289,25 @@ ret <16 x i16> %add } +define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 { +; AVX-LABEL: splat_v16i16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v16i16_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> + ret <16 x i16> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 { ; AVX-LABEL: splat_v16i8: @@ -177,6 +324,21 @@ ret <16 x i8> %add } +define <16 x i8> @splat_v16i8_pgso(<16 x i8> %x) !prof !14 { +; AVX-LABEL: splat_v16i8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v16i8_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2> + ret <16 x i8> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { ; AVX-LABEL: splat_v32i8: @@ -197,6 +359,25 @@ ret <32 x i8> %add } +define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 { +; AVX-LABEL: splat_v32i8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v32i8_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2> + ret <32 x i8> %add +} + ; PR23259: Verify that ISel doesn't crash with a 'fatal error in backend' ; due to a missing AVX pattern to select a v2i64 X86ISD::BROADCAST of a ; loadi64 with multiple uses.
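A note on what all of these *_pgso functions are exercising. Each one takes an existing optsize/minsize test, drops the attribute, and instead marks the function cold through !prof !14 plus the module-level ProfileSummary, so the same size-saving codegen must now be chosen purely from profile data. The query the passes make looks roughly like the following C++ sketch (simplified; preferSmallCode is an illustrative name, while Function::hasOptSize() and shouldOptimizeForSize() from llvm/Transforms/Utils/SizeOpts.h are the real entry points):

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

using namespace llvm;

// Illustrative call-site pattern for a transform that trades speed for
// size: fire when the function is attributed optsize/minsize, or when
// PGSO classifies the function as cold in the profile (guaranteed here
// by the function_entry_count of 0 in these tests).
static bool preferSmallCode(const Function &F, ProfileSummaryInfo *PSI,
                            BlockFrequencyInfo *BFI) {
  if (F.hasOptSize())
    return true; // attribute-driven, the pre-PGSO behavior
  return shouldOptimizeForSize(&F, PSI, BFI);
}

The machine-level passes consult an equivalent MachineFunction query, which is why each splat_*_pgso above is expected to pick the compact broadcast form, byte for byte like its attributed twin.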
@@ -238,3 +419,20 @@ attributes #0 = { optsize } attributes #1 = { minsize } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll --- a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll +++ b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll @@ -19,6 +19,23 @@ } +define void @zero_pgso(i32* %p) !prof !14 { +; CHECK32-LABEL: zero_pgso: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl $0, (%eax) +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: zero_pgso: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: movl $0, (%rdi) +; CHECK64-NEXT: retq +entry: + store i32 0, i32* %p + ret void + +} + define void @minus_one_optsize(i32* %p) optsize { ; CHECK32-LABEL: minus_one_optsize: ; CHECK32: # %bb.0: # %entry @@ -36,6 +53,22 @@ } +define void @minus_one_pgso(i32* %p) !prof !14 { +; CHECK32-LABEL: minus_one_pgso: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl $-1, (%eax) +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: minus_one_pgso: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: movl $-1, (%rdi) +; CHECK64-NEXT: retq +entry: + store i32 -1, i32* %p + ret void + +} define void @zero_64(i64* %p) minsize { ; CHECK32-LABEL: zero_64: @@ -244,3 +277,20 @@ store volatile i16 -1, i16* %p ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/switch-density.ll b/llvm/test/CodeGen/X86/switch-density.ll --- a/llvm/test/CodeGen/X86/switch-density.ll +++ b/llvm/test/CodeGen/X86/switch-density.ll @@ -79,3 +79,72 @@ ; CHECK: ja ; CHECK: jmpq *.LJTI } + +define void @dense_optsize(i32 %x) optsize { +entry: + switch i32 %x, label %return [ + i32 12, label %bb0 + i32 4, label %bb1 + i32 16, label %bb1 + i32 20, label %bb2 + i32 8, label %bb3 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +bb2: tail call void @g(i32 1) br label %return +bb3: tail call void @g(i32 2) br label %return +return: ret void + +; Lowered as branches. 
+; CHECK-LABEL: dense_optsize +; CHECK: cmpl $11 +; CHECK: cmpl $20 +; CHECK: cmpl $16 +; CHECK: cmpl $12 +; CHECK: cmpl $4 +; CHECK: cmpl $8 +; CHECK: retq +} + +define void @dense_pgso(i32 %x) !prof !14 { +entry: + switch i32 %x, label %return [ + i32 12, label %bb0 + i32 4, label %bb1 + i32 16, label %bb1 + i32 20, label %bb2 + i32 8, label %bb3 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +bb2: tail call void @g(i32 1) br label %return +bb3: tail call void @g(i32 2) br label %return +return: ret void + +; Lowered as branches. +; CHECK-LABEL: dense_pgso +; CHECK: cmpl $11 +; CHECK: cmpl $20 +; CHECK: cmpl $16 +; CHECK: cmpl $12 +; CHECK: cmpl $4 +; CHECK: cmpl $8 +; CHECK: retq +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll --- a/llvm/test/CodeGen/X86/tail-opts.ll +++ b/llvm/test/CodeGen/X86/tail-opts.ll @@ -473,6 +473,47 @@ ret void } +define void @one_pgso(i32 %v) nounwind !prof !14 { +; CHECK-LABEL: one_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: je .LBB6_3 +; CHECK-NEXT: # %bb.1: # %bby +; CHECK-NEXT: cmpl $16, %edi +; CHECK-NEXT: je .LBB6_4 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: jmp tail_call_me # TAILCALL +; CHECK-NEXT: .LBB6_3: # %bbx +; CHECK-NEXT: cmpl $128, %edi +; CHECK-NEXT: jne tail_call_me # TAILCALL +; CHECK-NEXT: .LBB6_4: # %return +; CHECK-NEXT: retq +entry: + %0 = icmp eq i32 %v, 0 + br i1 %0, label %bbx, label %bby + +bby: + switch i32 %v, label %bb7 [ + i32 16, label %return + ] + +bb7: + tail call void @tail_call_me() + ret void + +bbx: + switch i32 %v, label %bb12 [ + i32 128, label %return + ] + +bb12: + tail call void @tail_call_me() + ret void + +return: + ret void +} + ; two - Same as one, but with two instructions in the common ; tail instead of one. This is too much to be merged, given ; the optsize attribute. 
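One practical note before the hunks that follow: most of their -/+ pairs change no test logic at all. Inserting one_pgso and two_pgso renumbers every later machine basic block, so .LBB6_* becomes .LBB7_* and so on, and the autogenerated assertions have to be refreshed wholesale. Assuming the checks were produced by utils/update_llc_test_checks.py (their dense CHECK-NEXT style suggests so), a typical way to regenerate them is (paths illustrative; adjust to your checkout and build tree):

$ llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc llvm/test/CodeGen/X86/tail-opts.ll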
@@ -484,10 +525,51 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB6_1 +; CHECK-NEXT: je .LBB7_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB6_1: # %bb7 +; CHECK-NEXT: .LBB7_1: # %bb7 +; CHECK-NEXT: movl $0, {{.*}}(%rip) +; CHECK-NEXT: movl $1, {{.*}}(%rip) +entry: + %0 = icmp eq i32 undef, 0 + br i1 %0, label %bbx, label %bby + +bby: + switch i32 undef, label %bb7 [ + i32 16, label %return + ] + +bb7: + store volatile i32 0, i32* @XYZ + store volatile i32 1, i32* @XYZ + unreachable + +bbx: + switch i32 undef, label %bb12 [ + i32 128, label %return + ] + +bb12: + store volatile i32 0, i32* @XYZ + store volatile i32 1, i32* @XYZ + unreachable + +return: + ret void +} + +define void @two_pgso() nounwind !prof !14 { +; CHECK-LABEL: two_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB8_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB8_1: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: movl $1, {{.*}}(%rip) entry: @@ -527,10 +609,10 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB7_1 +; CHECK-NEXT: je .LBB9_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB7_1: # %bb7 +; CHECK-NEXT: .LBB9_1: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: movl $1, {{.*}}(%rip) entry: @@ -568,20 +650,20 @@ ; CHECK-LABEL: two_nosize: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: je .LBB8_3 +; CHECK-NEXT: je .LBB10_3 ; CHECK-NEXT: # %bb.1: # %bby ; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: je .LBB8_4 +; CHECK-NEXT: je .LBB10_4 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: jmp tail_call_me # TAILCALL -; CHECK-NEXT: .LBB8_3: # %bbx +; CHECK-NEXT: .LBB10_3: # %bbx ; CHECK-NEXT: cmpl $-1, %edx -; CHECK-NEXT: je .LBB8_4 +; CHECK-NEXT: je .LBB10_4 ; CHECK-NEXT: # %bb.5: # %bb12 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: jmp tail_call_me # TAILCALL -; CHECK-NEXT: .LBB8_4: # %return +; CHECK-NEXT: .LBB10_4: # %return ; CHECK-NEXT: retq entry: %0 = icmp eq i32 %x, 0 @@ -621,11 +703,11 @@ ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: cmovgq %rdi, %rax ; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: jle .LBB9_2 +; CHECK-NEXT: jle .LBB11_2 ; CHECK-NEXT: # %bb.1: # %bb.nph ; CHECK-NEXT: imulq %rdi, %rsi ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: .LBB9_2: # %for.end +; CHECK-NEXT: .LBB11_2: # %for.end ; CHECK-NEXT: retq entry: %cmp = icmp slt i64 %parami, 1 ; [#uses=1] @@ -654,24 +736,24 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.1: # %cont1 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.2: # %cont2 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.3: # %cont3 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.4: # %cont4 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB10_5: # %abort1 +; CHECK-NEXT: .LBB12_5: # %abort1 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq abort entry: @@ -714,27 +796,27 @@ ; 
CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_5 +; CHECK-NEXT: je .LBB13_5 ; CHECK-NEXT: # %bb.1: # %cont1 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: je .LBB13_6 ; CHECK-NEXT: # %bb.2: # %cont2 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_5 +; CHECK-NEXT: je .LBB13_5 ; CHECK-NEXT: # %bb.3: # %cont3 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: je .LBB13_6 ; CHECK-NEXT: # %bb.4: # %cont4 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB11_5: # %abort1 +; CHECK-NEXT: .LBB13_5: # %abort1 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq abort -; CHECK-NEXT: .LBB11_6: # %abort2 +; CHECK-NEXT: .LBB13_6: # %abort2 ; CHECK-NEXT: callq alt_abort entry: %c1 = call i1 @qux() @@ -763,3 +845,20 @@ cont4: ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll --- a/llvm/test/CodeGen/X86/test-vs-bittest.ll +++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll @@ -49,6 +49,30 @@ ret void } +define void @test64_pgso(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jb .LBB2_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB2_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 2048 + %s = icmp eq i64 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + ; This test is identical to test64 above with only the destination of the br ; reversed. This somehow causes the two functions to get slightly different ; initial IR. One has an extra invert of the setcc. 
This previous caused one @@ -60,10 +84,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: je .LBB3_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB2_2: # %no +; CHECK-NEXT: .LBB3_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -84,10 +108,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB3_2 +; CHECK-NEXT: jae .LBB4_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB3_2: # %no +; CHECK-NEXT: .LBB4_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 2048 + %s = icmp eq i64 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_2(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB5_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB5_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -108,10 +156,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jb .LBB4_2 +; CHECK-NEXT: jb .LBB6_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB4_2: # %no +; CHECK-NEXT: .LBB6_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -132,10 +180,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jb .LBB5_2 +; CHECK-NEXT: jb .LBB7_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB5_2: # %no +; CHECK-NEXT: .LBB7_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 4294967296 + %s = icmp eq i64 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_3(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btq $32, %rdi +; CHECK-NEXT: jb .LBB8_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB8_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -156,10 +228,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jae .LBB6_2 +; CHECK-NEXT: jae .LBB9_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB6_2: # %no +; CHECK-NEXT: .LBB9_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -180,10 +252,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jae .LBB7_2 +; CHECK-NEXT: jae .LBB10_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB7_2: # %no +; CHECK-NEXT: .LBB10_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 4294967296 + %s = icmp eq i64 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_4(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_4: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; 
CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btq $32, %rdi +; CHECK-NEXT: jae .LBB11_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB11_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -204,10 +300,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: jne .LBB8_2 +; CHECK-NEXT: jne .LBB12_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB8_2: # %no +; CHECK-NEXT: .LBB12_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -228,10 +324,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jb .LBB9_2 +; CHECK-NEXT: jb .LBB13_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB9_2: # %no +; CHECK-NEXT: .LBB13_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -252,10 +348,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB10_2 +; CHECK-NEXT: je .LBB14_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB10_2: # %no +; CHECK-NEXT: .LBB14_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -276,10 +372,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB11_2 +; CHECK-NEXT: jae .LBB15_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB11_2: # %no +; CHECK-NEXT: .LBB15_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i32 %x, 2048 + %s = icmp eq i32 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test32_pgso_2(i32 inreg %x) !prof !14 { +; CHECK-LABEL: test32_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB16_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB16_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -300,10 +420,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: jne .LBB12_2 +; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB12_2: # %no +; CHECK-NEXT: .LBB17_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -324,10 +444,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jb .LBB13_2 +; CHECK-NEXT: jb .LBB18_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB13_2: # %no +; CHECK-NEXT: .LBB18_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i16 %x, 2048 + %s = icmp eq i16 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test16_pgso(i16 inreg %x) !prof !14 { +; CHECK-LABEL: test16_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jb .LBB19_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB19_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ 
-348,10 +492,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB14_2 +; CHECK-NEXT: je .LBB20_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB14_2: # %no +; CHECK-NEXT: .LBB20_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -372,10 +516,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB15_2 +; CHECK-NEXT: jae .LBB21_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB15_2: # %no +; CHECK-NEXT: .LBB21_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i16 %x, 2048 + %s = icmp eq i16 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test16_pgso_2(i16 inreg %x) !prof !14 { +; CHECK-LABEL: test16_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB22_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB22_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -512,3 +680,20 @@ } declare void @bar() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2002,6 +2002,56 @@ ret <8 x i32> %b } +define <4 x double> @shuffle_v4f64_0zzz_pgso(<4 x double> %a) !prof !14 { +; ALL-LABEL: shuffle_v4f64_0zzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x double> %b +} + +define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 { +; ALL-LABEL: shuffle_v4i64_0zzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x i64> %b +} + +define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 { +; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512VL-NEXT: retq + %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x float> %b +} + +define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 { +; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +;
AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512VL-NEXT: retq + %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x i32> %b +} + define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) { ; ALL-LABEL: unpckh_v4i64: ; ALL: # %bb.0: @@ -2022,3 +2072,19 @@ ret <4 x double> %unpckh } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll --- a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll +++ b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll @@ -240,3 +240,140 @@ %a = xor i64 %x, 9223372036854775808 ; toggle bit 63 ret i64 %a } + +define i64 @and1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $31, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 18446744071562067967 ; clear bit 31 + ret i64 %a +} + +define i64 @and2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $32, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 18446744069414584319 ; clear bit 32 + ret i64 %a +} + +define i64 @and3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $62, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 13835058055282163711 ; clear bit 62 + ret i64 %a +} + +define i64 @and4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $63, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 9223372036854775807 ; clear bit 63 + ret i64 %a +} + +define i64 @or1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $31, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 2147483648 ; set bit 31 + ret i64 %a +} + +define i64 @or2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $32, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 4294967296 ; set bit 32 + ret i64 %a +} + +define i64 @or3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $62, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 4611686018427387904 ; set bit 62 + ret i64 %a +} + +define i64 @or4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $63, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 9223372036854775808 ; set bit 63 + ret i64 %a +} + +define i64 @xor1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $31, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 2147483648 ; toggle bit 31 + ret i64 %a +} + +define i64 @xor2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor2_pgso: +; CHECK: # %bb.0: +;
CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $32, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 4294967296 ; toggle bit 32 + ret i64 %a +} + +define i64 @xor3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $62, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 4611686018427387904 ; toggle bit 62 + ret i64 %a +} + +define i64 @xor4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $63, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 9223372036854775808 ; toggle bit 63 + ret i64 %a +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll @@ -50,6 +50,19 @@ ret i64 %or } +define i64 @_Z8lshift11mm_pgso(i64 %a, i64 %b) !prof !14 { +; CHECK-LABEL: _Z8lshift11mm_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shldq $11, %rsi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, 11 + %shr = lshr i64 %b, 53 + %or = or i64 %shr, %shl + ret i64 %or +} + attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; clang -O2 -c test2.cpp -emit-llvm -S @@ -78,3 +91,19 @@ attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll --- a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll +++ b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll @@ -25,6 +25,26 @@ ret void } +define void @f_pgso(i8* %p, i8* %q, i32* inalloca nocapture %unused) !prof !14 { +entry: + %g = alloca %struct.T, align 8 + %r = alloca i32, align 8 + store i32 0, i32* %r, align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %p, i8* align 8 %q, i32 24, i1 false) + br label %while.body + +while.body: ; preds = %while.body, %entry + %load = load i32, i32* %r, align 4 + %dec = add nsw i32 %load, -1 + store i32 %dec, i32* 
%r, align 4 + call void @g(%struct.T* %g) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body + ret void +} + ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1 @@ -46,5 +66,38 @@ ; CHECK: testb %[[NE_REG]], %[[NE_REG]] ; CHECK: jne +; CHECK-LABEL: _f_pgso: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: andl $-8, %esp +; CHECK-NOT: movl %esp, %esi +; CHECK: rep;movsl +; CHECK: leal 8(%esp), %esi + +; CHECK: decl (%esp) +; CHECK: setne %[[NE_REG:.*]] +; CHECK: pushl %esi +; CHECK: calll _g +; CHECK: addl $4, %esp +; CHECK: testb %[[NE_REG]], %[[NE_REG]] +; CHECK: jne + attributes #0 = { nounwind optsize } attributes #1 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll --- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -152,6 +152,30 @@ br label %fallthrough } +; Negative test - opt for size +define void @test6_pgso(i1 %cond, i64* %base) !prof !14 { +; CHECK-LABEL: @test6 +entry: +; CHECK: %addr = getelementptr + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br i1 %cond, label %if.then, label %fallthrough + +if.then: +; CHECK-LABEL: if.then: +; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40 + %v1 = load i32, i32* %casted, align 4 + call void @foo(i32 %v1) + %cmp = icmp eq i32 %v1, 0 + br i1 %cmp, label %rare.1, label %fallthrough + +fallthrough: + ret void + +rare.1: + call void @slowpath(i32 %v1, i32* %casted) cold + br label %fallthrough +} ; Make sure sinking two copies of addressing mode into different blocks works ; when there are cold paths for each. @@ -278,3 +302,20 @@ store i1 false, i1* %G23 ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0}
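A closing note on the ProfileSummary boilerplate repeated in every touched file: the entry count of 0 attached via !prof !14 is what arms PGSO. Working through the numbers, assuming LLVM's default cold cutoff of -profile-summary-cutoff-cold=999999 (parts per million):

  DetailedSummary entry !13 = {999999, 1, 2}  =>  the count at the 99.9999% cutoff is 1
  =>  ProfileSummaryInfo derives a cold-count threshold of 1
  =>  function_entry_count 0 <= 1, so every test function is classified cold
  =>  shouldOptimizeForSize() returns true, matching the optsize attribute

Conversely, giving a function an entry count comfortably above that threshold would be the natural way to write negative tests where PGSO must not fire.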