Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7127,13 +7127,52 @@ N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1))) return IsFSHL ? N0 : N1; - // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) + auto IsUndefOrZero = [](SDValue V) { + if (V.isUndef()) + return true; + if (ConstantSDNode *Cst = isConstOrConstSplat(V, /*AllowUndefs*/true)) + return Cst->getAPIntValue() == 0; + return false; + }; + if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) { + EVT ShAmtTy = N2.getValueType(); + + // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth) if (Cst->getAPIntValue().uge(BitWidth)) { uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth); return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1, - DAG.getConstant(RotAmt, SDLoc(N), N2.getValueType())); + DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy)); } + + unsigned ShAmt = Cst->getZExtValue(); + if (ShAmt == 0) + return IsFSHL ? N0 : N1; + + // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C) + // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C) + // fold fshl(N0, undef_or_zero, C) -> shl(N0, C) + // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C) + if (IsUndefOrZero(N0)) + return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, + DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt, + SDLoc(N), ShAmtTy)); + if (IsUndefOrZero(N1)) + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, + DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, + SDLoc(N), ShAmtTy)); + } + + // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2) + // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2) + // iff We know the shift amount is in range. + // TODO: when is it worth doing SUB(BW, N2) as well? + if (isPowerOf2_32(BitWidth)) { + APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1); + if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) + return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2); + if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits)) + return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2); } // fold (fshl N0, N0, N2) -> (rotl N0, N2) Index: llvm/trunk/test/CodeGen/X86/funnel-shift.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/funnel-shift.ll +++ llvm/trunk/test/CodeGen/X86/funnel-shift.ll @@ -404,12 +404,13 @@ ; X32-SSE2-LABEL: fshl_i32_undef0_cst: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: shldl $9, %eax, %eax +; X32-SSE2-NEXT: shrl $23, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_undef0_cst: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: shldl $9, %edi, %eax +; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: shrl $23, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 9) ret i32 %res @@ -438,19 +439,18 @@ ; X32-SSE2-LABEL: fshl_i32_undef1_msk: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $7, %ecx -; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X32-SSE2-NEXT: shldl %cl, %eax, %eax +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X32-SSE2-NEXT: andb $7, %cl +; X32-SSE2-NEXT: shll %cl, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_undef1_msk: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %esi, %ecx ; X64-AVX2-NEXT: movl %edi, %eax -; X64-AVX2-NEXT: andl $7, %ecx +; X64-AVX2-NEXT: andb $7, %cl ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shldl %cl, %eax, %eax +; X64-AVX2-NEXT: shll %cl, %eax ; X64-AVX2-NEXT: retq %m = and i32 %a1, 7 %res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 %m) @@ -461,13 +461,13 @@ ; X32-SSE2-LABEL: fshl_i32_undef1_cst: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: shldl $9, %eax, %eax +; X32-SSE2-NEXT: shll $9, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_undef1_cst: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %edi, %eax -; X64-AVX2-NEXT: shldl $9, %eax, %eax +; X64-AVX2-NEXT: shll $9, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 9) ret i32 %res @@ -513,19 +513,18 @@ ; X32-SSE2-LABEL: fshr_i32_undef0_msk: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: andl $7, %ecx -; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X32-SSE2-NEXT: shrdl %cl, %eax, %eax +; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X32-SSE2-NEXT: andb $7, %cl +; X32-SSE2-NEXT: shrl %cl, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_undef0_msk: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %esi, %ecx ; X64-AVX2-NEXT: movl %edi, %eax -; X64-AVX2-NEXT: andl $7, %ecx +; X64-AVX2-NEXT: andb $7, %cl ; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrdl %cl, %eax, %eax +; X64-AVX2-NEXT: shrl %cl, %eax ; X64-AVX2-NEXT: retq %m = and i32 %a1, 7 %res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 %m) @@ -536,13 +535,13 @@ ; X32-SSE2-LABEL: fshr_i32_undef0_cst: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: shrdl $9, %eax, %eax +; X32-SSE2-NEXT: shrl $9, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_undef0_cst: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: movl %edi, %eax -; X64-AVX2-NEXT: shrdl $9, %eax, %eax +; X64-AVX2-NEXT: shrl $9, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 9) ret i32 %res @@ -592,12 +591,13 @@ ; X32-SSE2-LABEL: fshr_i32_undef1_cst: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: shrdl $9, %eax, %eax +; X32-SSE2-NEXT: shll $23, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_undef1_cst: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: shrdl $9, %edi, %eax +; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: shll $23, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 9) ret i32 %res @@ -645,15 +645,14 @@ define i32 @fshl_i32_zero0_cst(i32 %a0) nounwind { ; X32-SSE2-LABEL: fshl_i32_zero0_cst: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: shldl $9, %ecx, %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: shrl $23, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_zero0_cst: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: shldl $9, %edi, %eax +; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: shrl $23, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshl.i32(i32 0, i32 %a0, i32 9) ret i32 %res @@ -683,15 +682,14 @@ define i32 @fshl_i32_zero1_cst(i32 %a0) nounwind { ; X32-SSE2-LABEL: fshl_i32_zero1_cst: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: shrdl $23, %ecx, %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: shll $9, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshl_i32_zero1_cst: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: shrdl $23, %edi, %eax +; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: shll $9, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshl.i32(i32 %a0, i32 0, i32 9) ret i32 %res @@ -721,15 +719,14 @@ define i32 @fshr_i32_zero0_cst(i32 %a0) nounwind { ; X32-SSE2-LABEL: fshr_i32_zero0_cst: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: shldl $23, %ecx, %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: shrl $9, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_zero0_cst: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: shldl $23, %edi, %eax +; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: shrl $9, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshr.i32(i32 0, i32 %a0, i32 9) ret i32 %res @@ -758,15 +755,14 @@ define i32 @fshr_i32_zero1_cst(i32 %a0) nounwind { ; X32-SSE2-LABEL: fshr_i32_zero1_cst: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: shrdl $9, %ecx, %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: shll $23, %eax ; X32-SSE2-NEXT: retl ; ; X64-AVX2-LABEL: fshr_i32_zero1_cst: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: shrdl $9, %edi, %eax +; X64-AVX2-NEXT: movl %edi, %eax +; X64-AVX2-NEXT: shll $23, %eax ; X64-AVX2-NEXT: retq %res = call i32 @llvm.fshr.i32(i32 %a0, i32 0, i32 9) ret i32 %res