Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -276,12 +276,28 @@
                                FeatureSSE1, FeatureFXSR]>;
 def : Proc<"pentium3m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
                                FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium-m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                               FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium4",        [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                               FeatureSSE2, FeatureFXSR]>;
-def : Proc<"pentium4m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                               FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified. This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcessorModel<"pentium-m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+def : ProcessorModel<"pentium4", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR]>;
+
+def : ProcessorModel<"pentium4m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
 
 // Intel Quark.
 def : Proc<"lakemont", []>;
@@ -292,10 +308,10 @@
                                FeatureFXSR, FeatureSlowBTMem]>;
 
 // NetBurst.
-def : Proc<"prescott",
-           [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
-            FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"nocona", [
+def : ProcessorModel<"prescott", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+                      FeatureFXSR, FeatureSlowBTMem]>;
+def : ProcessorModel<"nocona", GenericPostRAModel, [
   FeatureX87,
   FeatureSlowUAMem16,
   FeatureMMX,
Index: lib/Target/X86/X86Schedule.td
===================================================================
--- lib/Target/X86/X86Schedule.td
+++ lib/Target/X86/X86Schedule.td
@@ -633,8 +633,9 @@
 // latencies. Since these latencies are not used for pipeline hazards,
 // they do not need to be exact.
 //
-// The GenericModel contains no instruction itineraries.
-def GenericModel : SchedMachineModel {
+// The GenericX86Model contains no instruction itineraries
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
   let IssueWidth = 4;
   let MicroOpBufferSize = 32;
   let LoadLatency = 4;
@@ -643,6 +644,13 @@
   let CompleteModel = 0;
 }
 
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+  let PostRAScheduler = 1;
+}
+
 include "X86ScheduleAtom.td"
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"
Index: test/CodeGen/X86/machine-cp.ll
===================================================================
--- test/CodeGen/X86/machine-cp.ll
+++ test/CodeGen/X86/machine-cp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs -post-RA-scheduler=false < %s | FileCheck %s
 
 ; After tail duplication, two copies in an early exit BB can be cancelled out.
 ; rdar://10640363
Index: test/CodeGen/X86/misched-ilp.ll
===================================================================
--- test/CodeGen/X86/misched-ilp.ll
+++ test/CodeGen/X86/misched-ilp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmax -post-RA-scheduler=false | FileCheck -check-prefix=MAX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmin -post-RA-scheduler=false | FileCheck -check-prefix=MIN %s
 ;
 ; Basic verification of the ScheduleDAGILP metric.
 ;
Index: test/CodeGen/X86/post-ra-sched.ll
===================================================================
--- test/CodeGen/X86/post-ra-sched.ll
+++ test/CodeGen/X86/post-ra-sched.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
+;
+; Verify that scheduling puts some distance between a load feeding into
+; the address of another load, and that second load. This currently
+; happens during the post-RA-scheduler, which should be enabled by
+; default with the above specified cpus.
+
+@ptrs = external global [0 x i32*], align 4
+@idxa = common global i32 0, align 4
+@idxb = common global i32 0, align 4
+@res = common global i32 0, align 4
+
+define void @addindirect() {
+; CHECK-LABEL: addindirect:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl idxb, %ecx
+; CHECK-NEXT:    movl idxa, %eax
+; CHECK-NEXT:    movl ptrs(,%ecx,4), %ecx
+; CHECK-NEXT:    movl ptrs(,%eax,4), %eax
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    addl (%eax), %ecx
+; CHECK-NEXT:    movl %ecx, res
+; CHECK-NEXT:    retl
+entry:
+  %0 = load i32, i32* @idxa, align 4
+  %arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
+  %1 = load i32*, i32** %arrayidx, align 4
+  %2 = load i32, i32* %1, align 4
+  %3 = load i32, i32* @idxb, align 4
+  %arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
+  %4 = load i32*, i32** %arrayidx1, align 4
+  %5 = load i32, i32* %4, align 4
+  %add = add i32 %5, %2
+  store i32 %add, i32* @res, align 4
+  ret void
+}
Index: test/CodeGen/X86/pr16360.ll
===================================================================
--- test/CodeGen/X86/pr16360.ll
+++ test/CodeGen/X86/pr16360.ll
@@ -5,9 +5,9 @@
 ; CHECK-LABEL: foo:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $1073741823, %edx # imm = 0x3FFFFFFF
 ; CHECK-NEXT:    shrl $2, %eax
 ; CHECK-NEXT:    orl $-67108864, %eax # imm = 0xFFFFFFFFFC000000
-; CHECK-NEXT:    movl $1073741823, %edx # imm = 0x3FFFFFFF
 ; CHECK-NEXT:    retl
 entry:
   %conv = sext i32 %sum to i64
Index: test/CodeGen/X86/sse2.ll
===================================================================
--- test/CodeGen/X86/sse2.ll
+++ test/CodeGen/X86/sse2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Tests for SSE2 and below, without SSE3+.
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
@@ -5,8 +6,8 @@
 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movapd (%ecx), %xmm0
 ; CHECK-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; CHECK-NEXT:    movapd %xmm0, (%eax)
@@ -21,8 +22,8 @@
 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movapd (%ecx), %xmm0
 ; CHECK-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT:    movapd %xmm0, (%eax)
@@ -38,9 +39,9 @@
 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps (%edx), %xmm0
 ; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movaps %xmm0, (%eax)
@@ -75,9 +76,9 @@
 ; CHECK-LABEL: test5:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    movl (%eax), %eax
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    retl
@@ -99,8 +100,8 @@
 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
 ; CHECK-LABEL: test6:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps (%ecx), %xmm0
 ; CHECK-NEXT:    movaps %xmm0, (%eax)
 ; CHECK-NEXT:    retl
@@ -181,8 +182,8 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movapd 0, %xmm0
 ; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    xorpd %xmm2, %xmm2
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; CHECK-NEXT:    addps %xmm1, %xmm0
 ; CHECK-NEXT:    movaps %xmm0, 0
@@ -198,9 +199,9 @@
 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
 ; CHECK-LABEL: test13:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps (%edx), %xmm0
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
@@ -218,11 +219,11 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movaps (%ecx), %xmm1
 ; CHECK-NEXT:    movaps (%eax), %xmm2
+; CHECK-NEXT:    movaps (%ecx), %xmm1
 ; CHECK-NEXT:    movaps %xmm2, %xmm0
-; CHECK-NEXT:    addps %xmm1, %xmm0
 ; CHECK-NEXT:    subps %xmm1, %xmm2
+; CHECK-NEXT:    addps %xmm1, %xmm0
 ; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; CHECK-NEXT:    retl
 	%tmp = load <4 x float>, <4 x float>* %y		; <<4 x float>> [#uses=2]
@@ -236,8 +237,8 @@
 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
 ; CHECK-LABEL: test15:
 ; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movapd (%ecx), %xmm0
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
 ; CHECK-NEXT:    retl
@@ -316,9 +317,9 @@
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    pmuludq %xmm1, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retl
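
A quick local check of the behavioral change, as a sketch using only flags that already appear in this patch: run llc on the new test with one of the affected CPUs, once with the new default and once with the -post-RA-scheduler=false override used by the updated tests above. With the default, the address-forming loads are expected to be interleaved with the dependent loads as checked in post-ra-sched.ll; with the override, the pre-RA ordering should remain.

  llc < test/CodeGen/X86/post-ra-sched.ll -mtriple=i386 -mcpu=pentium4
  llc < test/CodeGen/X86/post-ra-sched.ll -mtriple=i386 -mcpu=pentium4 -post-RA-scheduler=false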