diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29996,6 +29996,33 @@
   return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
 }
 
+// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
+uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
+  switch (Opcode) {
+  case ISD::BITREVERSE:
+    return 0x8040201008040201ULL;
+  case ISD::SHL:
+    assert((0 < Amt && Amt < 8) && "Shift amount out of range");
+    return ((0x0102040810204080ULL >> (Amt)) &
+            (0x0101010101010101ULL * (0xFF >> (Amt))));
+  case ISD::SRL:
+    assert((0 < Amt && Amt < 8) && "Shift amount out of range");
+    return ((0x0102040810204080ULL << (Amt)) &
+            (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
+  case ISD::SRA:
+    assert((0 < Amt && Amt < 8) && "Shift amount out of range");
+    return (getGFNICtrlImm(ISD::SRL, Amt) |
+            (0x8080808080808080ULL >> (64 - (8 * Amt))));
+  case ISD::ROTL:
+    assert((0 < Amt && Amt < 8) && "Rotate amount out of range");
+    return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
+  case ISD::ROTR:
+    assert((0 < Amt && Amt < 8) && "Rotate amount out of range");
+    return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
+  }
+  llvm_unreachable("Unsupported GFNI opcode");
+}
+
 // Return true if the required (according to Opcode) shift-imm form is natively
 // supported by the Subtarget
 static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
@@ -30161,6 +30188,14 @@
   if (VT == MVT::v16i8 && Subtarget.hasXOP())
     return SDValue();
 
+  if (Subtarget.hasGFNI()) {
+    uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
+    MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
+    return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
+                       DAG.getTargetConstant(0, dl, MVT::i8));
+  }
+
   if (Op.getOpcode() == ISD::SHL) {
     // Make a large shift.
     SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
@@ -31021,6 +31056,18 @@
                        DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
   }
 
+  // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
+  if (IsCstSplat && Subtarget.hasGFNI() &&
+      (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasAVX()) ||
+       (VT == MVT::v64i8 && Subtarget.useBWIRegs()))) {
+    uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+    uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
+    MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
+    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
+                       DAG.getTargetConstant(0, DL, MVT::i8));
+  }
+
   // Split 256-bit integers on XOP/pre-AVX2 targets.
   if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
     return splitVectorIntBinary(Op, DAG);
@@ -31942,7 +31989,8 @@
   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
   if (Subtarget.hasGFNI()) {
     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
-    SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+    SDValue Matrix =
+        DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
     Matrix = DAG.getBitcast(VT, Matrix);
     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
                        DAG.getTargetConstant(0, DL, MVT::i8));
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -11,27 +11,23 @@
 define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v16i8:
 ; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: psrlw $5, %xmm1
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT: psllw $3, %xmm0
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; GFNISSE-NEXT: por %xmm1, %xmm0
 ; GFNISSE-NEXT: retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_fshl_v16i8:
 ; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpsrlw $5, %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpsllw $3, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT: retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_fshl_v16i8:
 ; GFNIAVX512: # %bb.0:
-; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm2
-; GFNIAVX512-NEXT: vpsrlw $5, %xmm1, %xmm0
-; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; GFNIAVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX512-NEXT: retq
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> )
   ret <16 x i8> %res
@@ -41,25 +37,23 @@
 define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshr_v16i8:
 ; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: psrlw $7, %xmm1
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; GFNISSE-NEXT: paddb %xmm0, %xmm0
 ; GFNISSE-NEXT: por %xmm1, %xmm0
 ; GFNISSE-NEXT: retq
 ;
 ; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
 ; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX1OR2-NEXT: retq
 ;
 ; GFNIAVX512-LABEL: splatconstant_fshr_v16i8:
 ; GFNIAVX512: # %bb.0:
-; GFNIAVX512-NEXT: vpsrlw $7, %xmm1, %xmm1
+; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
 ; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; GFNIAVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; GFNIAVX512-NEXT: retq
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8>
%b, <16 x i8> ) ret <16 x i8> %res @@ -73,53 +67,37 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $4, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; GFNISSE-NEXT: movdqa %xmm4, %xmm5 -; GFNISSE-NEXT: pandn %xmm2, %xmm5 -; GFNISSE-NEXT: psllw $4, %xmm0 -; GFNISSE-NEXT: pand %xmm4, %xmm0 -; GFNISSE-NEXT: por %xmm5, %xmm0 -; GFNISSE-NEXT: psrlw $4, %xmm3 -; GFNISSE-NEXT: psllw $4, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 -; GFNISSE-NEXT: pandn %xmm3, %xmm4 -; GFNISSE-NEXT: por %xmm4, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [16909320,16909320] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE-NEXT: por %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSE-NEXT: por %xmm3, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_fshl_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsllw $4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_fshl_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_fshl_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm2 -; GFNIAVX512-NEXT: vpsrlw $4, %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> ) ret <32 x i8> %res @@ -129,53 +107,37 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE-LABEL: 
splatconstant_fshr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $6, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; GFNISSE-NEXT: movdqa %xmm4, %xmm5 -; GFNISSE-NEXT: pandn %xmm2, %xmm5 -; GFNISSE-NEXT: psllw $2, %xmm0 -; GFNISSE-NEXT: pand %xmm4, %xmm0 -; GFNISSE-NEXT: por %xmm5, %xmm0 -; GFNISSE-NEXT: psrlw $6, %xmm3 -; GFNISSE-NEXT: psllw $2, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 -; GFNISSE-NEXT: pandn %xmm3, %xmm4 -; GFNISSE-NEXT: por %xmm4, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1108169199648,1108169199648] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE-NEXT: por %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSE-NEXT: por %xmm3, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_fshr_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsllw $2, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_fshr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $6, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_fshr_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm2 -; GFNIAVX512-NEXT: vpsrlw $6, %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> ) ret <32 x i8> %res @@ -189,45 +151,31 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $7, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNISSE-NEXT: pand %xmm8, %xmm4 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: por %xmm4, %xmm0 -; GFNISSE-NEXT: psrlw $7, %xmm5 -; GFNISSE-NEXT: pand %xmm8, %xmm5 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm5 ; GFNISSE-NEXT: paddb %xmm1, %xmm1 ; GFNISSE-NEXT: por %xmm5, %xmm1 -; GFNISSE-NEXT: psrlw $7, %xmm6 -; GFNISSE-NEXT: pand %xmm8, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm6 ; GFNISSE-NEXT: paddb %xmm2, %xmm2 ; GFNISSE-NEXT: por %xmm6, %xmm2 -; GFNISSE-NEXT: psrlw $7, %xmm7 -; GFNISSE-NEXT: pand %xmm7, %xmm8 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm7 ; GFNISSE-NEXT: paddb %xmm3, %xmm3 -; GFNISSE-NEXT: por %xmm8, %xmm3 +; GFNISSE-NEXT: por %xmm7, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 +; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm5 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm3 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 @@ -237,22 +185,20 @@ ; ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $7, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsrlw $7, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $7, %zmm1, %zmm1 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; GFNIAVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, 
<64 x i8> ) ret <64 x i8> %res @@ -262,84 +208,51 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $2, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNISSE-NEXT: movdqa %xmm8, %xmm9 -; GFNISSE-NEXT: pandn %xmm4, %xmm9 -; GFNISSE-NEXT: psllw $6, %xmm0 -; GFNISSE-NEXT: pand %xmm8, %xmm0 -; GFNISSE-NEXT: por %xmm9, %xmm0 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa %xmm8, %xmm4 -; GFNISSE-NEXT: pandn %xmm5, %xmm4 -; GFNISSE-NEXT: psllw $6, %xmm1 -; GFNISSE-NEXT: pand %xmm8, %xmm1 -; GFNISSE-NEXT: por %xmm4, %xmm1 -; GFNISSE-NEXT: psrlw $2, %xmm6 -; GFNISSE-NEXT: movdqa %xmm8, %xmm4 -; GFNISSE-NEXT: pandn %xmm6, %xmm4 -; GFNISSE-NEXT: psllw $6, %xmm2 -; GFNISSE-NEXT: pand %xmm8, %xmm2 -; GFNISSE-NEXT: por %xmm4, %xmm2 -; GFNISSE-NEXT: psrlw $2, %xmm7 -; GFNISSE-NEXT: psllw $6, %xmm3 -; GFNISSE-NEXT: pand %xmm8, %xmm3 -; GFNISSE-NEXT: pandn %xmm7, %xmm8 -; GFNISSE-NEXT: por %xmm8, %xmm3 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [290499906672525312,290499906672525312] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [258,258] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 +; GFNISSE-NEXT: por %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm5 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm1 +; GFNISSE-NEXT: por %xmm5, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2 +; GFNISSE-NEXT: por %xmm6, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm7 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm3 +; GFNISSE-NEXT: por %xmm7, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; GFNIAVX1-NEXT: vpsllw $6, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [258,258,258,258] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; GFNIAVX1-NEXT: vpsllw $6, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 +; GFNIAVX1-NEXT: 
vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [258,258,258,258] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsrlw $2, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; GFNIAVX2-NEXT: vpsllw $6, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $6, %zmm0, %zmm2 -; GFNIAVX512-NEXT: vpsrlw $2, %zmm1, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -11,28 +11,17 @@ define <16 x i8> @splatconstant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotl_v16i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm1 -; GFNISSE-NEXT: psrlw $5, %xmm1 -; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; GFNISSE-NEXT: psllw $3, %xmm0 -; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; GFNISSE-NEXT: por %xmm1, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1OR2-LABEL: splatconstant_rotl_v16i8: ; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $5, %xmm0, %xmm1 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpsllw $3, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotl_v16i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm1 -; GFNIAVX512-NEXT: vpsrlw $5, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %res @@ -42,26 +31,17 @@ define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotr_v16i8: ; GFNISSE: # 
%bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm1 -; GFNISSE-NEXT: psrlw $7, %xmm1 -; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: por %xmm1, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8: ; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm1 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotr_v16i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $7, %xmm0, %xmm1 -; GFNIAVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %res @@ -75,53 +55,25 @@ define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: psrlw $4, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; GFNISSE-NEXT: movdqa %xmm3, %xmm4 -; GFNISSE-NEXT: pandn %xmm2, %xmm4 -; GFNISSE-NEXT: psllw $4, %xmm0 -; GFNISSE-NEXT: pand %xmm3, %xmm0 -; GFNISSE-NEXT: por %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: psrlw $4, %xmm2 -; GFNISSE-NEXT: psllw $4, %xmm1 -; GFNISSE-NEXT: pand %xmm3, %xmm1 -; GFNISSE-NEXT: pandn %xmm2, %xmm3 -; GFNISSE-NEXT: por %xmm3, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1161999622378488840,1161999622378488840] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_rotl_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; GFNIAVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 -; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_rotl_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $4, %ymm0, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1161999622378488840,1161999622378488840,1161999622378488840,1161999622378488840] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotl_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm1 -; GFNIAVX512-NEXT: vpsrlw 
$4, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> ) ret <32 x i8> %res @@ -131,53 +83,25 @@ define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: psrlw $6, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; GFNISSE-NEXT: movdqa %xmm3, %xmm4 -; GFNISSE-NEXT: pandn %xmm2, %xmm4 -; GFNISSE-NEXT: psllw $2, %xmm0 -; GFNISSE-NEXT: pand %xmm3, %xmm0 -; GFNISSE-NEXT: por %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: psrlw $6, %xmm2 -; GFNISSE-NEXT: psllw $2, %xmm1 -; GFNISSE-NEXT: pand %xmm3, %xmm1 -; GFNISSE-NEXT: pandn %xmm2, %xmm3 -; GFNISSE-NEXT: por %xmm3, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [4647715923615551520,4647715923615551520] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_rotr_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $6, %xmm1, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; GFNIAVX1-NEXT: vpsllw $2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $6, %xmm0, %xmm2 -; GFNIAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_rotr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $6, %ymm0, %ymm1 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4647715923615551520,4647715923615551520,4647715923615551520,4647715923615551520] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotr_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm1 -; GFNIAVX512-NEXT: vpsrlw $6, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> ) ret <32 x i8> %res @@ -191,72 +115,30 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm4 -; GFNISSE-NEXT: psrlw $7, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNISSE-NEXT: pand %xmm5, %xmm4 -; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: por %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm4 -; GFNISSE-NEXT: psrlw $7, %xmm4 -; GFNISSE-NEXT: pand %xmm5, %xmm4 -; 
GFNISSE-NEXT: paddb %xmm1, %xmm1 -; GFNISSE-NEXT: por %xmm4, %xmm1 -; GFNISSE-NEXT: movdqa %xmm2, %xmm4 -; GFNISSE-NEXT: psrlw $7, %xmm4 -; GFNISSE-NEXT: pand %xmm5, %xmm4 -; GFNISSE-NEXT: paddb %xmm2, %xmm2 -; GFNISSE-NEXT: por %xmm4, %xmm2 -; GFNISSE-NEXT: movdqa %xmm3, %xmm4 -; GFNISSE-NEXT: psrlw $7, %xmm4 -; GFNISSE-NEXT: pand %xmm5, %xmm4 -; GFNISSE-NEXT: paddb %xmm3, %xmm3 -; GFNISSE-NEXT: por %xmm4, %xmm3 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9223655728169885760,9223655728169885760] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm1, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm2 -; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $7, %zmm0, %zmm1 -; GFNIAVX512-NEXT: vpaddb %zmm0, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> ) ret <64 x i8> %res @@ -266,85 +148,30 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = 
[192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNISSE-NEXT: movdqa %xmm4, %xmm6 -; GFNISSE-NEXT: pandn %xmm5, %xmm6 -; GFNISSE-NEXT: psllw $6, %xmm0 -; GFNISSE-NEXT: pand %xmm4, %xmm0 -; GFNISSE-NEXT: por %xmm6, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa %xmm4, %xmm6 -; GFNISSE-NEXT: pandn %xmm5, %xmm6 -; GFNISSE-NEXT: psllw $6, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 -; GFNISSE-NEXT: por %xmm6, %xmm1 -; GFNISSE-NEXT: movdqa %xmm2, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: movdqa %xmm4, %xmm6 -; GFNISSE-NEXT: pandn %xmm5, %xmm6 -; GFNISSE-NEXT: psllw $6, %xmm2 -; GFNISSE-NEXT: pand %xmm4, %xmm2 -; GFNISSE-NEXT: por %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa %xmm3, %xmm5 -; GFNISSE-NEXT: psrlw $2, %xmm5 -; GFNISSE-NEXT: psllw $6, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 -; GFNISSE-NEXT: pandn %xmm5, %xmm4 -; GFNISSE-NEXT: por %xmm4, %xmm3 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [290499906672525570,290499906672525570] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 -; GFNIAVX1-NEXT: vpsllw $6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 -; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 -; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 -; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 -; GFNIAVX1-NEXT: vpsllw $6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm3 -; GFNIAVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 -; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsrlw $2, %ymm1, %ymm2 -; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vpsllw $6, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570] +; GFNIAVX2-NEXT: 
vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $6, %zmm0, %zmm1 -; GFNIAVX512-NEXT: vpsrlw $2, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+gfni | FileCheck %s --check-prefixes=GFNISSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1OR2,GFNIAVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX1OR2,GFNIAVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX,GFNIAVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=GFNIAVX1OR2,GFNIAVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=GFNIAVX1OR2,GFNIAVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=GFNIAVX512 ; ; 128 Bit Vector Shifts @@ -11,15 +11,18 @@ define <16 x i8> @splatconstant_shl_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_shl_v16i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psllw $3, %xmm0 -; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: splatconstant_shl_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_shl_v16i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: retq +; +; GFNIAVX512-LABEL: splatconstant_shl_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %shift = shl <16 x i8> %a, ret <16 x i8> %shift } @@ -27,15 +30,18 @@ define <16 x i8> @splatconstant_lshr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_lshr_v16i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $7, %xmm0 -; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX-LABEL: splatconstant_lshr_v16i8: -; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: vpsrlw $7, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: retq +; +; GFNIAVX512-LABEL: splatconstant_lshr_v16i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vgf2p8affineqb $0, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; GFNIAVX512-NEXT: retq %shift = lshr <16 x i8> %a, ret <16 x i8> %shift } @@ -43,28 +49,17 @@ define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_ashr_v16i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $4, %xmm0 -; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNISSE-NEXT: pxor %xmm1, %xmm0 -; GFNISSE-NEXT: psubb %xmm1, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8: ; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v16i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 -; GFNIAVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -77,34 +72,25 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_shl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psllw $6, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNISSE-NEXT: pand %xmm2, %xmm0 -; GFNISSE-NEXT: psllw $6, %xmm1 -; GFNISSE-NEXT: pand %xmm2, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [258,258] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_shl_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; GFNIAVX1-NEXT: vpsllw $6, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] -; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsllw $6, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_shl_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [258,258,258,258] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_shl_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $6, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -113,34 +99,25 @@ define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_lshr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $1, %xmm0 -; GFNISSE-NEXT: 
movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; GFNISSE-NEXT: pand %xmm2, %xmm0 -; GFNISSE-NEXT: psrlw $1, %xmm1 -; GFNISSE-NEXT: pand %xmm2, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [145249953336295424,145249953336295424] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -149,49 +126,25 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_ashr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; GFNISSE-NEXT: pand %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; GFNISSE-NEXT: pxor %xmm3, %xmm0 -; GFNISSE-NEXT: psubb %xmm3, %xmm0 -; GFNISSE-NEXT: psrlw $2, %xmm1 -; GFNISSE-NEXT: pand %xmm2, %xmm1 -; GFNISSE-NEXT: pxor %xmm3, %xmm1 -; GFNISSE-NEXT: psubb %xmm3, %xmm1 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [290499906672558208,290499906672558208] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; GFNIAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; GFNIAVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsrlw $2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [290499906672558208,290499906672558208,290499906672558208,290499906672558208] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $2, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -204,47 +157,30 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_shl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] -; GFNISSE-NEXT: pand %xmm4, %xmm0 -; GFNISSE-NEXT: psllw $5, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 -; GFNISSE-NEXT: psllw $5, %xmm2 -; GFNISSE-NEXT: pand %xmm4, %xmm2 -; GFNISSE-NEXT: psllw $5, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [66052,66052] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_shl_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsllw $5, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsllw $5, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [66052,66052,66052,66052] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_shl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsllw $5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] -; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [66052,66052,66052,66052] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_shl_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsllw $5, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, 
%zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -253,47 +189,30 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_lshr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $7, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNISSE-NEXT: pand %xmm4, %xmm0 -; GFNISSE-NEXT: psrlw $7, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 -; GFNISSE-NEXT: psrlw $7, %xmm2 -; GFNISSE-NEXT: pand %xmm4, %xmm2 -; GFNISSE-NEXT: psrlw $7, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $7, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -302,72 +221,30 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_ashr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: psrlw $1, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; GFNISSE-NEXT: pand %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; GFNISSE-NEXT: pxor %xmm5, %xmm0 -; GFNISSE-NEXT: psubb %xmm5, %xmm0 -; GFNISSE-NEXT: psrlw $1, %xmm1 -; GFNISSE-NEXT: pand %xmm4, %xmm1 -; GFNISSE-NEXT: pxor %xmm5, %xmm1 -; GFNISSE-NEXT: psubb %xmm5, %xmm1 -; GFNISSE-NEXT: psrlw $1, %xmm2 -; GFNISSE-NEXT: pand %xmm4, %xmm2 -; GFNISSE-NEXT: pxor %xmm5, %xmm2 -; GFNISSE-NEXT: psubb 
%xmm5, %xmm2 -; GFNISSE-NEXT: psrlw $1, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 -; GFNISSE-NEXT: pxor %xmm5, %xmm3 -; GFNISSE-NEXT: psubb %xmm5, %xmm3 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [145249953336295552,145249953336295552] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; GFNIAVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpsubb %xmm4, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [145249953336295552,145249953336295552,145249953336295552,145249953336295552] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; GFNIAVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [145249953336295552,145249953336295552,145249953336295552,145249953336295552] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; GFNIAVX512-NEXT: vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; GFNIAVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git 
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -5,10 +5,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-VBMI1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI

; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
@@ -1731,12 +1731,24 @@
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: splatconstant_rotate_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
-; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8:
+; CHECK-VBMI1: # %bb.0:
+; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-VBMI1-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
+; CHECK-VBMI1-NEXT: retq
+;
+; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8:
+; CHECK-GFNI: # %bb.0:
+; CHECK-GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; CHECK-GFNI-NEXT: retq
  %shl = shl <32 x i8> %a,
  %lshr = lshr <32 x i8> %a,
  %or = or <32 x i8> %shl, %lshr
@@ -1744,13 +1756,27 @@
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
-; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
-; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-VBMI1-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-VBMI1: # %bb.0:
+; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-VBMI1-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
+; CHECK-VBMI1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-VBMI1-NEXT: retq
+;
+; CHECK-GFNI-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK-GFNI: # %bb.0:
+; CHECK-GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; CHECK-GFNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; CHECK-GFNI-NEXT: retq
  %shl = shl <32 x i8> %a,
  %lshr = lshr <32 x i8> %a,
  %rmask = and <32 x i8> %lshr,