Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -263,6 +263,15 @@
           "fast-lzcnt", "HasFastLZCNT", "true",
           "LZCNT instructions are as fast as most simple integer ops">;
+
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement rotate to avoid the partial flag update of the normal
+// rotate instructions.
+def FeatureFastSHLDRotate
+    : SubtargetFeature<
+          "fast-shld-rotate", "HasFastSHLDRotate", "true",
+          "SHLD can be used as a faster rotate">;
+
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 //===----------------------------------------------------------------------===//
@@ -458,7 +467,8 @@
   FeatureXSAVE,
   FeatureXSAVEOPT,
   FeatureLAHFSAHF,
-  FeatureFastScalarFSQRT
+  FeatureFastScalarFSQRT,
+  FeatureFastSHLDRotate
 ]>;
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.td
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.td
@@ ... @@
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
 def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasMFence : Predicate<"Subtarget->hasMFence()">;
 
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td
+++ llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td
@@ -846,6 +846,15 @@
 
 } // Defs = [EFLAGS]
 
+// Sandy Bridge and newer Intel processors support faster rotates using
+// SHLD to avoid a partial flag update on the normal rotate instructions.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
+  def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+            (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+  def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+            (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+}
+
 def ROT32L2R_imm8 : SDNodeXForm<imm, [{
   // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
   return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -229,6 +229,9 @@
   /// True if LZCNT instruction is fast.
   bool HasFastLZCNT;
 
+  /// True if SHLD-based rotate is fast.
+  bool HasFastSHLDRotate;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -466,6 +469,7 @@
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
+  bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp
@@ -302,6 +302,7 @@
   HasFastScalarFSQRT = false;
   HasFastVectorFSQRT = false;
   HasFastLZCNT = false;
+  HasFastSHLDRotate = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
Index: llvm/trunk/test/CodeGen/X86/rot32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/rot32.ll
+++ llvm/trunk/test/CodeGen/X86/rot32.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
 ; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
@@ -49,6 +50,8 @@
 entry:
 ; CHECK-LABEL: xfoo:
 ; CHECK: roll $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldl $7
 ; BMI2-LABEL: xfoo:
 ; BMI2: rorxl $25
   %0 = lshr i32 %x, 25
@@ -61,6 +64,8 @@
 entry:
 ; CHECK-LABEL: xfoop:
 ; CHECK: roll $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldl $7
 ; BMI2-LABEL: xfoop:
 ; BMI2: rorxl $25
   %x = load i32, i32* %p
@@ -84,6 +89,8 @@
 entry:
 ; CHECK-LABEL: xun:
 ; CHECK: roll $25
+; SHLD-LABEL: xun:
+; SHLD: shldl $25
 ; BMI2-LABEL: xun:
 ; BMI2: rorxl $7
   %0 = lshr i32 %x, 7
@@ -96,6 +103,8 @@
 entry:
 ; CHECK-LABEL: xunp:
 ; CHECK: roll $25
+; SHLD-LABEL: xunp:
+; SHLD: shldl $25
 ; BMI2-LABEL: xunp:
 ; BMI2: rorxl $7
   %x = load i32, i32* %p
Index: llvm/trunk/test/CodeGen/X86/rot64.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/rot64.ll
+++ llvm/trunk/test/CodeGen/X86/rot64.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
 ; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
@@ -49,6 +50,8 @@
 entry:
 ; CHECK-LABEL: xfoo:
 ; CHECK: rolq $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldq $7
 ; BMI2-LABEL: xfoo:
 ; BMI2: rorxq $57
   %0 = lshr i64 %x, 57
@@ -61,6 +64,8 @@
 entry:
 ; CHECK-LABEL: xfoop:
 ; CHECK: rolq $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldq $7
 ; BMI2-LABEL: xfoop:
 ; BMI2: rorxq $57
   %x = load i64, i64* %p
@@ -84,6 +89,8 @@
 entry:
 ; CHECK-LABEL: xun:
 ; CHECK: rolq $57
+; SHLD-LABEL: xun:
+; SHLD: shldq $57
 ; BMI2-LABEL: xun:
 ; BMI2: rorxq $7
   %0 = lshr i64 %x, 7
@@ -96,6 +103,8 @@
 entry:
 ; CHECK-LABEL: xunp:
 ; CHECK: rolq $57
+; SHLD-LABEL: xunp:
+; SHLD: shldq $57
 ; BMI2-LABEL: xunp:
 ; BMI2: rorxq $7
   %x = load i64, i64* %p
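
For a source-level view of what the new patterns catch, the lshr/shl/or sequences in rot32.ll are what Clang emits for the usual rotate idiom. A minimal C++ sketch (the function name rotl7 is illustrative, not part of the commit):

#include <cstdint>

// LLVM's DAG combiner folds the shift/or pair below into a single rotl
// node, which is exactly what the (rotl GR32:$src, (i8 imm:$shamt))
// pattern above matches. On a fast-shld-rotate subtarget the backend
// should therefore emit "shldl $7, %eax, %eax" instead of "roll $7, %eax",
// sidestepping the partial EFLAGS update of the plain rotate.
std::uint32_t rotl7(std::uint32_t x) {
  return (x << 7) | (x >> 25);  // rotate left by 7
}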
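
The tests rely on -mcpu=corei7-avx (Sandy Bridge) to turn the feature on, but since fast-shld-rotate is an ordinary SubtargetFeature it should also be controllable directly through llc's -mattr flag; a hypothetical RUN line in the same style as the tests above (this exact invocation is an assumption, not part of the commit):

; RUN: llc < %s -march=x86 -mattr=+fast-shld-rotate | FileCheck %s --check-prefix=SHLD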