diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1725,11 +1725,40 @@ case ISD::ROTR: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); + bool IsROTL = (Op.getOpcode() == ISD::ROTL); // If we're rotating an 0/-1 value, then it stays an 0/-1 value. if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1)) return TLO.CombineTo(Op, Op0); + if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) { + unsigned Amt = SA->getAPIntValue().urem(BitWidth); + unsigned RevAmt = BitWidth - Amt; + + // rotl: (Op0 << Amt) | (Op0 >> (BW - Amt)) + // rotr: (Op0 << (BW - Amt)) | (Op0 >> Amt) + APInt Demanded0 = DemandedBits.rotr(IsROTL ? Amt : RevAmt); + if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO, + Depth + 1)) + return true; + + // rot*(x, 0) --> x + if (Amt == 0) + return TLO.CombineTo(Op, Op0); + + // See if we don't demand either half of the rotated bits. + if ((!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT)) && + DemandedBits.countTrailingZeros() >= (IsROTL ? Amt : RevAmt)) { + Op1 = TLO.DAG.getConstant(IsROTL ? Amt : RevAmt, dl, Op1.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, Op1)); + } + if ((!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT)) && + DemandedBits.countLeadingZeros() >= (IsROTL ? RevAmt : Amt)) { + Op1 = TLO.DAG.getConstant(IsROTL ? RevAmt : Amt, dl, Op1.getValueType()); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); + } + } + // For pow-2 bitwidths we only demand the bottom modulo amt bits. if (isPowerOf2_32(BitWidth)) { APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -17049,18 +17049,6 @@ const ARMSubtarget *ST) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { - // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high - // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. - SDValue N1 = N->getOperand(1); - if (ConstantSDNode *C = dyn_cast(N1)) { - SDValue N0 = N->getOperand(0); - if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && - DAG.MaskedValueIsZero(N0.getOperand(0), - APInt::getHighBitsSet(32, 16))) - return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); - } - } if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && N->getOperand(0)->getOpcode() == ISD::AND && diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -4749,6 +4749,13 @@ def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr), (STRH (REV16 GPR:$Rn), addrmode3:$addr)>; +def bswap32_zeroupper : PatFrag<(ops node:$lhs), (bswap node:$lhs),[{ + return CurDAG->MaskedValueIsZero(N->getOperand(0), + APInt::getHighBitsSet(32, 16)); +}]>; +def : ARMV6Pat<(srl (bswap32_zeroupper GPR:$Rn), (i32 16)), + (REV16 GPR:$Rn)>; + let AddedComplexity = 5 in def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "revsh", "\t$Rd, $Rm", diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -111,21 +111,21 @@ ; XOPAVX1-LABEL: rot_v4i32_mask_ashr0: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_mask_ashr0: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr0: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vprold $1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = ashr <4 x i32> %a0, @@ -138,24 +138,24 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) { ; XOPAVX1-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOPAVX1-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOPAVX2-NEXT: vprotd $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 -; AVX512-NEXT: vprold $1, %xmm0, %xmm0 +; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -2075,51 +2075,15 @@ ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vprolq $15, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VLBW-NEXT: retq -; -; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vzeroupper -; AVX512VBMI2-NEXT: retq -; -; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vprolq $15, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: retq +; AVX512-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: ; XOP: # %bb.0: -; XOP-NEXT: vprotq $15, %xmm0, %xmm0 +; XOP-NEXT: vpsrlq $49, %xmm0, %xmm0 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512VLBW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefix=AVX512VBMI2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefix=AVX512VLVBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2 @@ -1799,60 +1799,24 @@ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vprolq $15, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VLBW-NEXT: retq -; -; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VBMI2-NEXT: retq -; -; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vprolq $15, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: retq +; AVX512-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1 +; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -1043,7 +1043,7 @@ define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind { ; AVX512-LABEL: splatconstant_rotate_mask_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $49, %zmm0, %zmm0 ; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq %shl = shl <8 x i64> %a,