Switch pentium-m, pentium4 and pentium4m from Proc to ProcessorModel using
GenericPostRAModel as the scheduling model. The feature list of each CPU is
identical before and after the change.

Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -276,12 +276,28 @@
                          FeatureSSE1, FeatureFXSR]>;
 def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
                          FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium-m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                         FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"pentium4", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                        FeatureSSE2, FeatureFXSR]>;
-def : Proc<"pentium4m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
-                         FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified. This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcessorModel<"pentium-m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+def : ProcessorModel<"pentium4", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR]>;
+
+def : ProcessorModel<"pentium4m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
 
 // Intel Quark.
 def : Proc<"lakemont", []>;
@@ -292,10 +308,10 @@
                          FeatureFXSR, FeatureSlowBTMem]>;
 
 // NetBurst.
-def : Proc<"prescott",
-           [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
-            FeatureFXSR, FeatureSlowBTMem]>;
-def : Proc<"nocona", [
+def : ProcessorModel<"prescott", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+                      FeatureFXSR, FeatureSlowBTMem]>;
+def : ProcessorModel<"nocona", GenericPostRAModel, [
   FeatureX87,
   FeatureSlowUAMem16,
   FeatureMMX,

Turn the body of GenericModel into the reusable class GenericX86Model and
re-create GenericModel as a def that derives from that class, so that a second
model with the PostRAScheduler enabled can derive from the same settings.

Index: llvm/trunk/lib/Target/X86/X86Schedule.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86Schedule.td
+++ llvm/trunk/lib/Target/X86/X86Schedule.td
@@ -633,8 +633,9 @@
 // latencies. Since these latencies are not used for pipeline hazards,
 // they do not need to be exact.
 //
-// The GenericModel contains no instruction itineraries.
-def GenericModel : SchedMachineModel {
+// The GenericX86Model contains no instruction itineraries
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
   let IssueWidth = 4;
   let MicroOpBufferSize = 32;
   let LoadLatency = 4;
@@ -643,6 +644,13 @@
   let CompleteModel = 0;
 }
 
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+  let PostRAScheduler = 1;
+}
+
 include "X86ScheduleAtom.td"
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"

Add a codegen test that runs llc with -mcpu set to each of pentium4,
pentium4m, pentium-m, prescott and nocona, and checks the exact instruction
order emitted for the function addindirect.

Index: llvm/trunk/test/CodeGen/X86/post-ra-sched.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/post-ra-sched.ll
+++ llvm/trunk/test/CodeGen/X86/post-ra-sched.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
+;
+; Verify that scheduling puts some distance between a load feeding into
+; the address of another load, and that second load. This currently
+; happens during the post-RA-scheduler, which should be enabled by
+; default with the above specified cpus.
+
+@ptrs = external global [0 x i32*], align 4
+@idxa = common global i32 0, align 4
+@idxb = common global i32 0, align 4
+@res = common global i32 0, align 4
+
+define void @addindirect() {
+; CHECK-LABEL: addindirect:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT:    movl idxb, %ecx
+; CHECK-NEXT:    movl idxa, %eax
+; CHECK-NEXT:    movl ptrs(,%ecx,4), %ecx
+; CHECK-NEXT:    movl ptrs(,%eax,4), %eax
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    addl (%eax), %ecx
+; CHECK-NEXT:    movl %ecx, res
+; CHECK-NEXT:    retl
+entry:
+  %0 = load i32, i32* @idxa, align 4
+  %arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
+  %1 = load i32*, i32** %arrayidx, align 4
+  %2 = load i32, i32* %1, align 4
+  %3 = load i32, i32* @idxb, align 4
+  %arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
+  %4 = load i32*, i32** %arrayidx1, align 4
+  %5 = load i32, i32* %4, align 4
+  %add = add i32 %5, %2
+  store i32 %add, i32* @res, align 4
+  ret void
+}