Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1335,6 +1335,13 @@
       setOperationAction(ISD::CTTZ,             VT, Custom);
     }
 
+    // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
+                    MVT::v8i64}) {
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+    }
+
     // Need to promote to 64-bit even though we have 32-bit masked instructions
     // because the IR optimizers rearrange bitcasts around logic ops leaving
     // too many variations to handle if we don't promote them.
@@ -22665,10 +22672,27 @@
   SDLoc DL(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
+  unsigned Opcode = Op.getOpcode();
+
+  if (Subtarget.hasAVX512()) {
+    // Attempt to rotate by immediate.
+    if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+      if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+        unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+        uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+        assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+        return DAG.getNode(RotOpc, DL, VT, R,
+                           DAG.getConstant(RotateAmt, DL, MVT::i8));
+      }
+    }
+
+    // Else, fall back on the variable VPROLV/VPRORV rotations.
+    return Op;
+  }
 
   assert(VT.isVector() && "Custom lowering only for vector rotates!");
   assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
-  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+  assert((Opcode == ISD::ROTL) && "Only ROTL supported");
 
   // XOP has 128-bit vector variable + immediate rotates.
   // +ve/-ve Amt = rotate left/right.
@@ -24030,7 +24054,8 @@
   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
   case ISD::UMUL_LOHI:
   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
-  case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
+  case ISD::ROTL:
+  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
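For reference (not part of the patch): the hasAVX512() block above turns a splat-constant rotate amount into an immediate X86ISD::VROTLI/VROTRI node and leaves every other amount to the variable VPROLV/VPRORV patterns added below. A minimal IR sketch of the immediate path (illustrative only; the function name is invented, and it assumes DAGCombine has already folded the shl/lshr/or idiom into ISD::ROTL, as the splatconstant_rotate_* tests below rely on):

define <4 x i32> @rotl_splat_by_7(<4 x i32> %x) {
  ; Per-lane (x << 7) | (x >> 25); with this patch it should select vprold $7,
  ; directly with AVX512VL or via the widened 512-bit form without VLX.
  %shl = shl <4 x i32> %x, <i32 7, i32 7, i32 7, i32 7>
  %lshr = lshr <4 x i32> %x, <i32 25, i32 25, i32 25, i32 25>
  %rot = or <4 x i32> %shl, %lshr
  ret <4 x i32> %rot
}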
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -5676,6 +5676,109 @@
 defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
 defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
+
+// Use 512-bit VPROLV/VPROLI to implement v2i64/v4i64 + v4i32/v8i32 rotl without VLX.
+let Predicates = [HasAVX512, NoVLX] in {
+  def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPROLVQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPROLVQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+             sub_ymm)>;
+
+  def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPROLVDZrr
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+             sub_xmm)>;
+  def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPROLVDZrr
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+             sub_ymm)>;
+
+  def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPROLQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                imm:$src2)), sub_xmm)>;
+  def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPROLQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                imm:$src2)), sub_ymm)>;
+
+  def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPROLDZri
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                imm:$src2)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPROLDZri
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                imm:$src2)), sub_ymm)>;
+}
+
+// Use 512-bit VPRORV/VPRORI to implement v2i64/v4i64 + v4i32/v8i32 rotr without VLX.
+let Predicates = [HasAVX512, NoVLX] in {
+  def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPRORVQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPRORVQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+             sub_ymm)>;
+
+  def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPRORVDZrr
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+             sub_xmm)>;
+  def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPRORVDZrr
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+             sub_ymm)>;
+
+  def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPRORQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                imm:$src2)), sub_xmm)>;
+  def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPRORQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                imm:$src2)), sub_ymm)>;
+
+  def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPRORDZri
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                imm:$src2)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v16i32
+              (VPRORDZri
+                (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                imm:$src2)), sub_ymm)>;
+}
+
 
 //===-------------------------------------------------------------------===//
 // 1-src variable permutation VPERMW/D/Q
 //===-------------------------------------------------------------------===//
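For reference (not part of the patch): the patterns above insert the 128/256-bit operands into a 512-bit register, perform the rotate with the Z-register instruction, and extract the original subregister again, which is what the AVX512BW (no-VLX) check lines in the tests below expect. A rough sketch of IR that should take this path on an AVX512F-only target (illustrative; the RUN/CHECK lines are an assumption modelled on the existing vector-rotate tests, not copied from them):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s

define <4 x i32> @var_rotl_no_vlx(<4 x i32> %x, <4 x i32> %amt) {
; CHECK: vprolvd %zmm1, %zmm0, %zmm0
  %inv = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %amt
  %shl = shl <4 x i32> %x, %amt
  %lshr = lshr <4 x i32> %x, %inv
  %rot = or <4 x i32> %shl, %lshr
  ret <4 x i32> %rot
}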
Index: test/CodeGen/X86/combine-rotates.ll
===================================================================
--- test/CodeGen/X86/combine-rotates.ll
+++ test/CodeGen/X86/combine-rotates.ll
@@ -40,12 +40,7 @@
 ;
 ; AVX512-LABEL: combine_vec_rot_rot_splat:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $29, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $10, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vprold $7, %xmm0, %xmm0
 ; AVX512-NEXT: retq
   %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
   %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29>
@@ -63,12 +58,6 @@
 ;
 ; AVX512-LABEL: combine_vec_rot_rot_splat_zero:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: retq
   %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
   %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
Index: test/CodeGen/X86/vector-rotate-128.ll
===================================================================
--- test/CodeGen/X86/vector-rotate-128.ll
+++ test/CodeGen/X86/vector-rotate-128.ll
@@ -77,14 +77,19 @@
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: var_rotate_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; AVX512-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: var_rotate_v2i64:
 ; XOP: # BB#0:
@@ -214,14 +219,19 @@
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: var_rotate_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32,32,32,32]
-; AVX512-NEXT: vpsubd %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: var_rotate_v4i32:
 ; XOP: # BB#0:
@@ -1371,12 +1381,18 @@
 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $14, %xmm0, %xmm1
-; AVX512-NEXT: vpsrlq $50, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: splatconstant_rotate_v2i64:
 ; XOP: # BB#0:
@@ -1412,12 +1428,18 @@
 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $4, %xmm0, %xmm1
-; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: splatconstant_rotate_v4i32:
 ; XOP: # BB#0:
@@ -1544,11 +1566,19 @@
 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_mask_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: splatconstant_rotate_mask_v2i64:
 ; XOP: # BB#0:
@@ -1595,14 +1625,19 @@
 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_mask_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $4, %xmm0, %xmm1
-; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: splatconstant_rotate_mask_v4i32:
 ; XOP: # BB#0:
Index: test/CodeGen/X86/vector-rotate-256.ll
===================================================================
--- test/CodeGen/X86/vector-rotate-256.ll
+++ test/CodeGen/X86/vector-rotate-256.ll
@@ -48,14 +48,18 @@
 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: var_rotate_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
-; AVX512-NEXT: vpsubq %ymm1, %ymm2, %ymm2
-; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v4i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: var_rotate_v4i64:
 ; XOPAVX1: # BB#0:
@@ -135,14 +139,18 @@
 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: var_rotate_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32]
-; AVX512-NEXT: vpsubd %ymm1, %ymm2, %ymm2
-; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
-; AVX512-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: var_rotate_v8i32:
 ; XOPAVX1: # BB#0:
@@ -825,12 +833,17 @@
 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $14, %ymm0, %ymm1
-; AVX512-NEXT: vpsrlq $50, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_v4i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
 ; XOPAVX1: # BB#0:
@@ -873,12 +886,17 @@
 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $4, %ymm0, %ymm1
-; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
 ; XOPAVX1: # BB#0:
@@ -1027,11 +1045,18 @@
 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_mask_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
 ; XOPAVX1: # BB#0:
@@ -1082,14 +1107,18 @@
 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: splatconstant_rotate_mask_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $4, %ymm0, %ymm1
-; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
 ; XOPAVX1: # BB#0:
Index: test/CodeGen/X86/vector-rotate-512.ll
===================================================================
--- test/CodeGen/X86/vector-rotate-512.ll
+++ test/CodeGen/X86/vector-rotate-512.ll
@@ -11,11 +11,7 @@
 define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; AVX512-LABEL: var_rotate_v8i64:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [64,64,64,64,64,64,64,64]
-; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm2
-; AVX512-NEXT: vpsllvq %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpsrlvq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
   %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
   %shl = shl <8 x i64> %a, %b
@@ -27,11 +23,7 @@
 define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; AVX512-LABEL: var_rotate_v16i32:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512-NEXT: vpsubd %zmm1, %zmm2, %zmm2
-; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
   %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
   %shl = shl <16 x i32> %a, %b
@@ -571,9 +563,7 @@
 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_v8i64:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $14, %zmm0, %zmm1
-; AVX512-NEXT: vpsrlq $50, %zmm0, %zmm0
-; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
 ; AVX512-NEXT: retq
   %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
   %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
@@ -584,9 +574,7 @@
 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_v16i32:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $4, %zmm0, %zmm1
-; AVX512-NEXT: vpsrld $28, %zmm0, %zmm0
-; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vprold $4, %zmm0, %zmm0
 ; AVX512-NEXT: retq
   %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
@@ -697,7 +685,7 @@
 define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpsrlq $49, %zmm0, %zmm0
+; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: retq
   %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
@@ -711,11 +699,8 @@
 define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $4, %zmm0, %zmm1
-; AVX512-NEXT: vpsrld $28, %zmm0, %zmm0
+; AVX512-NEXT: vprold $4, %zmm0, %zmm0
 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT: retq
   %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>