Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23467,6 +23467,59 @@
   return SDValue();
 }
 
+static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  unsigned Size = VT.getSizeInBits();
+
+  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
+  // into (shl (sext a), [56,48,32,24,16] - SarConst) or
+  // into (lshr (sext a), SarConst - [56,48,32,24,16]),
+  // depending on the sign of (SarConst - [56,48,32,24,16]).
+
+  // sexts in X86 are MOVs. The MOVs have the same code size
+  // as the above SHIFTs (only a SHIFT by 1 has a smaller encoding).
+  // However, the MOVs have 2 advantages over a SHIFT:
+  // 1. MOVs can write to a register that differs from the source.
+  // 2. MOVs accept memory operands.
+
+  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+      N0.getOpcode() != ISD::SHL ||
+      N0.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+  EVT CVT = N1.getValueType();
+
+  if (SarConst.isNegative())
+    return SDValue();
+
+  for (MVT SVT : MVT::integer_valuetypes()) {
+    unsigned ShiftSize = SVT.getSizeInBits();
+    // Skip types without a corresponding sext/zext and ShlConst values
+    // that are not one of [56,48,32,24,16].
+    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+      continue;
+    SDLoc DL(N);
+    SDValue NN =
+        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+    SarConst = SarConst - (Size - ShiftSize);
+    if (SarConst == 0)
+      return NN;
+    else if (SarConst.isNegative())
+      return DAG.getNode(ISD::SHL, DL, VT, NN,
+                         DAG.getConstant(-SarConst, DL, CVT));
+    else
+      return DAG.getNode(ISD::SRA, DL, VT, NN,
+                         DAG.getConstant(SarConst, DL, CVT));
+  }
+  return SDValue();
+}
+
 /// \brief Returns a vector of 0s if the node in input is a vector logical
 /// shift by a constant amount which is known to be bigger than or equal
 /// to the vector element size in bits.
@@ -23505,6 +23558,10 @@
   if (SDValue V = PerformSHLCombine(N, DAG))
     return V;
 
+  if (N->getOpcode() == ISD::SRA)
+    if (SDValue V = PerformSRACombine(N, DAG))
+      return V;
+
   // Try to fold this logical shift into a zero vector.
   if (N->getOpcode() != ISD::SRA)
     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
Index: test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
===================================================================
--- test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
+++ test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -4,15 +4,23 @@
 ; a shr (X, -8) that gets subsequently "optimized away" as undef
 ; PR4254
 
+; After fixing PR24373:
+;   shlq $56, %rdi
+;   sarq $48, %rdi
+; folds into
+;   movsbq %dil, %rax
+;   shlq $8, %rax
+; which is better for x86.
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define i64 @foo(i64 %b) nounwind readnone {
 entry:
 ; CHECK-LABEL: foo:
-; CHECK: shlq $56, %rdi
-; CHECK: sarq $48, %rdi
-; CHECK: leaq 1(%rdi), %rax
+; CHECK: movsbq %dil, %rax
+; CHECK: shlq $8, %rax
+; CHECK: orq $1, %rax
   %shl = shl i64 %b, 56   ; <i64> [#uses=1]
   %shr = ashr i64 %shl, 48   ; <i64> [#uses=1]
   %add5 = or i64 %shr, 1   ; <i64> [#uses=1]
Index: test/CodeGen/X86/sar_fold.ll
===================================================================
--- test/CodeGen/X86/sar_fold.ll
+++ test/CodeGen/X86/sar_fold.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -O2 -march=x86 | FileCheck %s
+
+define i32 @shl16sar15(i32 %a) #0 {
+  %1 = shl i32 %a, 16
+  %2 = ashr exact i32 %1, 15
+  ret i32 %2
+}
+; CHECK: shl16sar15
+; CHECK: movswl
+
+define i32 @shl16sar17(i32 %a) #0 {
+  %1 = shl i32 %a, 16
+  %2 = ashr exact i32 %1, 17
+  ret i32 %2
+}
+; CHECK: shl16sar17
+; CHECK: movswl
+
+define i32 @shl24sar23(i32 %a) #0 {
+  %1 = shl i32 %a, 24
+  %2 = ashr exact i32 %1, 23
+  ret i32 %2
+}
+; CHECK: shl24sar23
+; CHECK: movsbl
+
+define i32 @shl24sar25(i32 %a) #0 {
+  %1 = shl i32 %a, 24
+  %2 = ashr exact i32 %1, 25
+  ret i32 %2
+}
+; CHECK: shl24sar25
+; CHECK: movsbl
Index: test/CodeGen/X86/sar_fold64.ll
===================================================================
--- test/CodeGen/X86/sar_fold64.ll
+++ test/CodeGen/X86/sar_fold64.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s
+
+define i32 @shl48sar47(i64 %a) #0 {
+  %1 = shl i64 %a, 48
+  %2 = ashr exact i64 %1, 47
+  %3 = trunc i64 %2 to i32
+  ret i32 %3
+}
+; CHECK: shl48sar47
+; CHECK: movswq
+
+define i32 @shl48sar49(i64 %a) #0 {
+  %1 = shl i64 %a, 48
+  %2 = ashr exact i64 %1, 49
+  %3 = trunc i64 %2 to i32
+  ret i32 %3
+}
+; CHECK: shl48sar49
+; CHECK: movswq
+
+define i32 @shl56sar55(i64 %a) #0 {
+  %1 = shl i64 %a, 56
+  %2 = ashr exact i64 %1, 55
+  %3 = trunc i64 %2 to i32
+  ret i32 %3
+}
+; CHECK: shl56sar55
+; CHECK: movsbq
+
+define i32 @shl56sar57(i64 %a) #0 {
+  %1 = shl i64 %a, 56
+  %2 = ashr exact i64 %1, 57
+  %3 = trunc i64 %2 to i32
+  ret i32 %3
+}
+; CHECK: shl56sar57
+; CHECK: movsbq
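
For reference, a worked example of the arithmetic the new combine performs (illustrative only, not part of the patch; the function name @example64 is made up): with VT = i64 we have Size = 64, and for a shl by 56 the loop selects SVT = i8 (ShiftSize = 8, since 64 - 8 = 56). An ashr by 48 then gives SarConst = 48 - 56 = -8, which is negative, so the shl/sar pair is rewritten as a left shift of the sign-extended low byte, matching the movsbq/shlq pair the updated CHECK lines above expect.

; Illustrative LLVM IR sketch (not part of the patch).
define i64 @example64(i64 %b) {
  ; before the combine: (ashr (shl %b, 56), 48)
  %shl = shl i64 %b, 56
  %shr = ashr i64 %shl, 48
  ; after the combine: (shl (sign_extend_inreg %b, i8), 8),
  ; i.e. movsbq %dil, %rax ; shlq $8, %rax on x86-64
  ret i64 %shr
}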