diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -828,6 +828,24 @@
     return N->getOpcode() == ISD::FDIV;
   }
 
+  // Given:
+  //    (icmp eq/ne (and X, C0), (shift X, C1))
+  // or
+  //    (icmp eq/ne X, (rotate X, CPow2))
+  //
+  // If C0 is a mask or shifted mask and the shift amount (C1) isolates the
+  // remaining bits (i.e. something like `(x64 & UINT32_MAX) == (x64 >> 32)`),
+  // do we prefer the shift to be shift-right, shift-left, or rotate?
+  // Note: It is only valid to convert the rotate version to the shift version
+  // iff the shift amount (`C1`) is a power of 2 (including 0).
+  // If ShiftOpc (the current opcode) is returned, do nothing.
+  virtual unsigned preferedOpcodeForCmpEqPiecesOfOperand(
+      EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+      const APInt &ShiftOrRotateAmt,
+      const std::optional<APInt> &AndMask) const {
+    return ShiftOpc;
+  }
+
   /// These two forms are equivalent:
   ///   sub %y, (xor %x, -1)
   ///   add (add %x, 1), %y
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12409,27 +12409,127 @@
   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
   EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
 
-  SDValue Combined = SimplifySetCC(VT, N->getOperand(0), N->getOperand(1), Cond,
-                                   SDLoc(N), !PreferSetCC);
-
-  if (!Combined)
-    return SDValue();
+  SDValue Combined = SimplifySetCC(VT, N0, N1, Cond, SDLoc(N), !PreferSetCC);
 
-  // If we prefer to have a setcc, and we don't, we'll try our best to
-  // recreate one using rebuildSetCC.
-  if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
-    SDValue NewSetCC = rebuildSetCC(Combined);
+  if (Combined) {
+    // If we prefer to have a setcc, and we don't, we'll try our best to
+    // recreate one using rebuildSetCC.
+    if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
+      SDValue NewSetCC = rebuildSetCC(Combined);
 
-    // We don't have anything interesting to combine to.
-    if (NewSetCC.getNode() == N)
-      return SDValue();
+      // We don't have anything interesting to combine to.
+      if (NewSetCC.getNode() == N)
+        return SDValue();
 
-    if (NewSetCC)
-      return NewSetCC;
+      if (NewSetCC)
+        return NewSetCC;
+    }
+    return Combined;
   }
 
-  return Combined;
+  // Optimize
+  //    1) (icmp eq/ne (and X, C0), (shift X, C1))
+  // or
+  //    2) (icmp eq/ne X, (rotate X, C1))
+  // If C0 is a mask or shifted mask and the shift amount (C1) isolates the
+  // remaining bits (i.e. something like `(x64 & UINT32_MAX) == (x64 >> 32)`),
+  // then:
+  // If C1 is a power of 2, the rotate and shift+and versions are equivalent,
+  // so we can interchange them depending on target preference.
+  // Otherwise, if we have the shift+and version, we can interchange srl/shl,
+  // which in turn affects the constant C0. We can use this to get better
+  // constants, again determined by target preference.
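+  //
+  // For example, for a 32-bit X:
+  //    (icmp eq (and X, 0x00FFFFFF), (srl X, 8))
+  // compares the low 24 bits with the high 24 bits, which is equivalent to
+  //    (icmp eq (and X, 0xFFFFFF00), (shl X, 8))
+  // and, since the shift amount 8 is a power of 2, also equivalent to
+  //    (icmp eq X, (rotl X, 8)).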
+  if (Cond == ISD::SETNE || Cond == ISD::SETEQ) {
+    auto IsAndWithShift = [](SDValue A, SDValue B) {
+      return A.getOpcode() == ISD::AND &&
+             (B.getOpcode() == ISD::SRL || B.getOpcode() == ISD::SHL) &&
+             A.getOperand(0) == B.getOperand(0);
+    };
+    auto IsRotateWithOp = [](SDValue A, SDValue B) {
+      return (B.getOpcode() == ISD::ROTL || B.getOpcode() == ISD::ROTR) &&
+             B.getOperand(0) == A;
+    };
+    SDValue AndOrOp = SDValue(), ShiftOrRotate = SDValue();
+    bool IsRotate = false;
+
+    // Find either shift+and or rotate pattern.
+    if (IsAndWithShift(N0, N1)) {
+      AndOrOp = N0;
+      ShiftOrRotate = N1;
+    } else if (IsAndWithShift(N1, N0)) {
+      AndOrOp = N1;
+      ShiftOrRotate = N0;
+    } else if (IsRotateWithOp(N0, N1)) {
+      IsRotate = true;
+      AndOrOp = N0;
+      ShiftOrRotate = N1;
+    } else if (IsRotateWithOp(N1, N0)) {
+      IsRotate = true;
+      AndOrOp = N1;
+      ShiftOrRotate = N0;
+    }
+
+    if (AndOrOp && ShiftOrRotate && ShiftOrRotate.hasOneUse() &&
+        (IsRotate || AndOrOp.hasOneUse())) {
+      EVT OpVT = N0.getValueType();
+      // Get the constant shift/rotate amount and possibly the mask (if it's
+      // the shift+and variant).
+      auto GetAPIntValue = [](SDValue Op) -> std::optional<APInt> {
+        ConstantSDNode *CNode = isConstOrConstSplat(Op, /*AllowUndefs*/ false,
+                                                    /*AllowTrunc*/ false);
+        if (CNode == nullptr)
+          return std::nullopt;
+        return CNode->getAPIntValue();
+      };
+      std::optional<APInt> AndCMask =
+          IsRotate ? std::nullopt : GetAPIntValue(AndOrOp.getOperand(1));
+      std::optional<APInt> ShiftCAmt =
+          GetAPIntValue(ShiftOrRotate.getOperand(1));
+      unsigned NumBits = OpVT.getScalarSizeInBits();
+
+      // We found constants.
+      if (ShiftCAmt && (IsRotate || AndCMask) && ShiftCAmt->ult(NumBits)) {
+        unsigned ShiftOpc = ShiftOrRotate.getOpcode();
+        // Check that the constants meet the constraints: the shift amount
+        // must complement the mask and the mask must match the shift
+        // direction.
+        bool CanTransform =
+            IsRotate ||
+            (*ShiftCAmt == (~*AndCMask).popcount() &&
+             (ShiftOpc == ISD::SHL ? (~*AndCMask).isMask()
+                                   : AndCMask->isMask()));
+
+        // See if target prefers another shift/rotate opcode.
+        unsigned NewShiftOpc = TLI.preferedOpcodeForCmpEqPiecesOfOperand(
+            OpVT, ShiftOpc, ShiftCAmt->isPowerOf2(), *ShiftCAmt, AndCMask);
+        // Transform is valid and we have a new preference.
+        if (CanTransform && NewShiftOpc != ShiftOpc) {
+          SDLoc DL(N);
+          SDValue NewShiftOrRotate =
+              DAG.getNode(NewShiftOpc, DL, OpVT, ShiftOrRotate.getOperand(0),
+                          ShiftOrRotate.getOperand(1));
+          SDValue NewAndOrOp = SDValue();
+
+          if (NewShiftOpc == ISD::SHL || NewShiftOpc == ISD::SRL) {
+            APInt NewMask =
+                NewShiftOpc == ISD::SHL
+                    ? APInt::getHighBitsSet(NumBits,
+                                            NumBits - ShiftCAmt->getZExtValue())
+                    : APInt::getLowBitsSet(NumBits,
+                                           NumBits - ShiftCAmt->getZExtValue());
+            NewAndOrOp =
+                DAG.getNode(ISD::AND, DL, OpVT, ShiftOrRotate.getOperand(0),
+                            DAG.getConstant(NewMask, DL, OpVT));
+          } else {
+            NewAndOrOp = ShiftOrRotate.getOperand(0);
+          }
+
+          return DAG.getSetCC(DL, VT, NewAndOrOp, NewShiftOrRotate, Cond);
+        }
+      }
+    }
+  }
+  return SDValue();
 }
 
 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1138,6 +1138,11 @@
                                            unsigned OldShiftOpcode,
                                            unsigned NewShiftOpcode,
                                            SelectionDAG &DAG) const override;
 
+    unsigned preferedOpcodeForCmpEqPiecesOfOperand(
+        EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+        const APInt &ShiftOrRotateAmt,
+        const std::optional<APInt> &AndMask) const override;
+
     bool preferScalarizeSplat(SDNode *N) const override;
 
     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3257,6 +3257,72 @@
   return NewShiftOpcode == ISD::SHL;
 }
 
+unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
+    EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+    const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
+  if (!VT.isInteger())
+    return ShiftOpc;
+
+  bool PreferRotate = false;
+  if (VT.isVector()) {
+    // For vectors, if we have rotate instruction support, then it's definitely
+    // best. Otherwise it's not clear which is best, so don't make changes.
+    PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
+                                             VT.getScalarType() == MVT::i64);
+  } else {
+    // For scalar, if we have BMI2, prefer rotate for rorx. Otherwise prefer
+    // rotate unless we have a zext mask+shr.
+    PreferRotate = Subtarget.hasBMI2();
+    if (!PreferRotate) {
+      unsigned MaskBits =
+          VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
+      PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
+    }
+  }
+
+  if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
+    assert(AndMask.has_value() && "Null andmask when querying about shift+and");
+
+    if (PreferRotate && MayTransformRotate)
+      return ISD::ROTL;
+
+    // For vectors we don't really get much benefit from swapping the
+    // constants around. Maybe we could check if the DAG has the flipped node
+    // already in the future.
+    if (VT.isVector())
+      return ShiftOpc;
+
+    // See if it is beneficial to swap the shift type.
+    if (ShiftOpc == ISD::SHL) {
+      // If the current setup has an imm64 mask, then the inverse will have
+      // at least an imm32 mask (or be zext i32 -> i64).
+      if (VT == MVT::i64)
+        return AndMask->getSignificantBits() > 32 ? ISD::SRL : ShiftOpc;
+
+      // We can only benefit if at least 7 bits are required for the mask. We
+      // don't want to replace shl of 1, 2, or 3, as they can be implemented
+      // with lea/add.
+      return ShiftOrRotateAmt.uge(7) ? ISD::SRL : ShiftOpc;
+    }
+
+    if (VT == MVT::i64)
+      // Keep exactly 32-bit imm64; this is zext i32 -> i64, which is
+      // extremely efficient.
+      return AndMask->getSignificantBits() > 33 ? ISD::SHL : ShiftOpc;
+
+    // Keep small shifts as shl so we can generate add/lea.
+    return ShiftOrRotateAmt.ult(7) ? ISD::SHL : ShiftOpc;
+  }
+
+  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
+  // (PreferRotate will be set in the latter case).
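+  // For example, without BMI2 an i32 `X == (rotl X, 16)` is better emitted as
+  // `(and X, 0xFFFF) == (srl X, 16)`, since movzwl materializes the mask for
+  // free (see shl_to_shr_ne_i32_s16 in cmp-shiftX-maskX.ll), while with BMI2
+  // rorx keeps the rotate form to a single non-destructive instruction.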
+ if (PreferRotate || VT.isVector()) + return ShiftOpc; + + // Non-vector type and we have a zext mask with SRL. + return ISD::SRL; +} + bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const { return N->getOpcode() != ISD::FP_EXTEND; } diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -20,9 +20,8 @@ ; CHECK-LABEL: shr_to_shl_eq_i8_s2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andb $63, %al -; CHECK-NEXT: shrb $2, %dil -; CHECK-NEXT: cmpb %dil, %al +; CHECK-NEXT: rolb $2, %al +; CHECK-NEXT: cmpb %al, %dil ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %and = and i8 %x, 63 @@ -35,9 +34,9 @@ ; CHECK-LABEL: shl_to_shr_ne_i8_s7: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shlb $7, %al -; CHECK-NEXT: andb $-128, %dil -; CHECK-NEXT: cmpb %dil, %al +; CHECK-NEXT: shrb $7, %al +; CHECK-NEXT: andb $1, %dil +; CHECK-NEXT: cmpb %al, %dil ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %shl = shl i8 %x, 7 @@ -63,9 +62,8 @@ ; CHECK-LABEL: shr_to_shl_eq_i8_s1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andb $127, %al -; CHECK-NEXT: shrb %dil -; CHECK-NEXT: cmpb %dil, %al +; CHECK-NEXT: rolb %al +; CHECK-NEXT: cmpb %al, %dil ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %and = and i8 %x, 127 @@ -77,10 +75,10 @@ define i1 @shr_to_shl_eq_i32_s3(i32 %x) { ; CHECK-LABEL: shr_to_shl_eq_i32_s3: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF -; CHECK-NEXT: shrl $3, %edi -; CHECK-NEXT: cmpl %edi, %eax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal (,%rdi,8), %eax +; CHECK-NEXT: andl $-8, %edi +; CHECK-NEXT: cmpl %eax, %edi ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %and = and i32 %x, 536870911 @@ -105,14 +103,20 @@ } define i1 @shl_to_shr_ne_i32_s16(i32 %x) { -; CHECK-LABEL: shl_to_shr_ne_i32_s16: -; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: andl $-65536, %edi # imm = 0xFFFF0000 -; CHECK-NEXT: cmpl %edi, %eax -; CHECK-NEXT: setne %al -; CHECK-NEXT: retq +; CHECK-NOBMI-LABEL: shl_to_shr_ne_i32_s16: +; CHECK-NOBMI: # %bb.0: +; CHECK-NOBMI-NEXT: movzwl %di, %eax +; CHECK-NOBMI-NEXT: shrl $16, %edi +; CHECK-NOBMI-NEXT: cmpl %edi, %eax +; CHECK-NOBMI-NEXT: setne %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: shl_to_shr_ne_i32_s16: +; CHECK-BMI2: # %bb.0: +; CHECK-BMI2-NEXT: rorxl $16, %edi, %eax +; CHECK-BMI2-NEXT: cmpl %eax, %edi +; CHECK-BMI2-NEXT: setne %al +; CHECK-BMI2-NEXT: retq %shl = shl i32 %x, 16 %and = and i32 %x, 4294901760 %r = icmp ne i32 %shl, %and @@ -137,9 +141,8 @@ define i1 @shr_to_shl_eq_i16_s1(i16 %x) { ; CHECK-LABEL: shr_to_shl_eq_i16_s1: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl %di, %eax -; CHECK-NEXT: andl $32767, %edi # imm = 0x7FFF -; CHECK-NEXT: shrl %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: rolw %ax ; CHECK-NEXT: cmpw %ax, %di ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq @@ -167,9 +170,9 @@ define i1 @shl_to_shr_eq_i64_s44(i64 %x) { ; CHECK-LABEL: shl_to_shr_eq_i64_s44: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $-17592186044416, %rax # imm = 0xFFFFF00000000000 -; CHECK-NEXT: andq %rdi, %rax -; CHECK-NEXT: shlq $44, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq $44, %rax +; CHECK-NEXT: andl $1048575, %edi # imm = 0xFFFFF ; CHECK-NEXT: cmpq %rax, %rdi ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq @@ -180,13 +183,20 @@ } define i1 
@shr_to_shl_ne_i64_s32(i64 %x) { -; CHECK-LABEL: shr_to_shl_ne_i64_s32: -; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrq $32, %rdi -; CHECK-NEXT: cmpq %rdi, %rax -; CHECK-NEXT: setne %al -; CHECK-NEXT: retq +; CHECK-NOBMI-LABEL: shr_to_shl_ne_i64_s32: +; CHECK-NOBMI: # %bb.0: +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: shrq $32, %rdi +; CHECK-NOBMI-NEXT: cmpq %rdi, %rax +; CHECK-NOBMI-NEXT: setne %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: shr_to_shl_ne_i64_s32: +; CHECK-BMI2: # %bb.0: +; CHECK-BMI2-NEXT: rorxq $32, %rdi, %rax +; CHECK-BMI2-NEXT: cmpq %rax, %rdi +; CHECK-BMI2-NEXT: setne %al +; CHECK-BMI2-NEXT: retq %and = and i64 %x, 4294967295 %shr = lshr i64 %x, 32 %r = icmp ne i64 %and, %shr @@ -230,9 +240,9 @@ define i1 @shl_to_shr_eq_i64_s63(i64 %x) { ; CHECK-LABEL: shl_to_shr_eq_i64_s63: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; CHECK-NEXT: andq %rdi, %rax -; CHECK-NEXT: shlq $63, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: cmpq %rax, %rdi ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq @@ -258,23 +268,14 @@ } define i1 @shr_to_shl_eq_i64_s7(i64 %x) { -; CHECK-NOBMI-LABEL: shr_to_shl_eq_i64_s7: -; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movabsq $144115188075855871, %rax # imm = 0x1FFFFFFFFFFFFFF -; CHECK-NOBMI-NEXT: andq %rdi, %rax -; CHECK-NOBMI-NEXT: shrq $7, %rdi -; CHECK-NOBMI-NEXT: cmpq %rdi, %rax -; CHECK-NOBMI-NEXT: sete %al -; CHECK-NOBMI-NEXT: retq -; -; CHECK-BMI2-LABEL: shr_to_shl_eq_i64_s7: -; CHECK-BMI2: # %bb.0: -; CHECK-BMI2-NEXT: movb $57, %al -; CHECK-BMI2-NEXT: bzhiq %rax, %rdi, %rax -; CHECK-BMI2-NEXT: shrq $7, %rdi -; CHECK-BMI2-NEXT: cmpq %rdi, %rax -; CHECK-BMI2-NEXT: sete %al -; CHECK-BMI2-NEXT: retq +; CHECK-LABEL: shr_to_shl_eq_i64_s7: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shlq $7, %rax +; CHECK-NEXT: andq $-128, %rdi +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %and = and i64 %x, 144115188075855871 %shr = lshr i64 %x, 7 %r = icmp eq i64 %and, %shr @@ -284,9 +285,8 @@ define i1 @shl_to_shr_ne_i32_s24(i32 %x) { ; CHECK-LABEL: shl_to_shr_ne_i32_s24: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: andl $-16777216, %edi # imm = 0xFF000000 +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: shrl $24, %edi ; CHECK-NEXT: cmpl %edi, %eax ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq @@ -312,14 +312,20 @@ } define i1 @shr_to_shl_ne_i32_s8(i32 %x) { -; CHECK-LABEL: shr_to_shl_ne_i32_s8: -; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andl $16777215, %eax # imm = 0xFFFFFF -; CHECK-NEXT: shrl $8, %edi -; CHECK-NEXT: cmpl %edi, %eax -; CHECK-NEXT: setne %al -; CHECK-NEXT: retq +; CHECK-NOBMI-LABEL: shr_to_shl_ne_i32_s8: +; CHECK-NOBMI: # %bb.0: +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: roll $8, %eax +; CHECK-NOBMI-NEXT: cmpl %eax, %edi +; CHECK-NOBMI-NEXT: setne %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: shr_to_shl_ne_i32_s8: +; CHECK-BMI2: # %bb.0: +; CHECK-BMI2-NEXT: rorxl $24, %edi, %eax +; CHECK-BMI2-NEXT: cmpl %eax, %edi +; CHECK-BMI2-NEXT: setne %al +; CHECK-BMI2-NEXT: retq %and = and i32 %x, 16777215 %shr = lshr i32 %x, 8 %r = icmp ne i32 %and, %shr @@ -359,9 +365,8 @@ ; ; CHECK-AVX512-LABEL: shr_to_ror_eq_4xi32_s4: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vpsrld $4, %xmm0, %xmm1 -; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 
-; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; CHECK-AVX512-NEXT: vprold $4, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq %shr = lshr <4 x i32> %x, @@ -402,9 +407,8 @@ ; ; CHECK-AVX512-LABEL: shl_to_ror_eq_4xi32_s8: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vpslld $8, %xmm0, %xmm1 -; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; CHECK-AVX512-NEXT: vprold $8, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq %shr = shl <4 x i32> %x,