Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -21227,7 +21227,7 @@ bool LShift = VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256()); - bool AShift = LShift && (Subtarget.hasVLX() || + bool AShift = LShift && (Subtarget.hasAVX512() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } @@ -21252,7 +21252,7 @@ if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (VT.is512BitVector() || Subtarget.hasVLX()) + if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -4899,6 +4899,33 @@ defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; +// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + VR128X:$src2)), sub_ymm)>; + + def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + VR128X:$src2)), sub_xmm)>; + + def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; +} + //===-------------------------------------------------------------------===// // Variable Bit Shifts //===-------------------------------------------------------------------===// @@ -4932,6 +4959,7 @@ SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; } + multiclass avx512_var_shift_sizes opc, string OpcodeStr, SDNode OpNode, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in @@ -4955,12 +4983,13 @@ } // Use 512bit version to implement 128/256 bit in case NoVLX. -multiclass avx512_var_shift_w_lowering { - let Predicates = [HasBWI, NoVLX] in { +multiclass avx512_var_shift_lowering p> { + let Predicates = p in { def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), (_.info256.VT _.info256.RC:$src2))), (EXTRACT_SUBREG - (!cast(NAME#"WZrr") + (!cast(OpcodeStr#"Zrr") (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; @@ -4968,13 +4997,12 @@ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), (_.info128.VT _.info128.RC:$src2))), (EXTRACT_SUBREG - (!cast(NAME#"WZrr") + (!cast(OpcodeStr#"Zrr") (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; } } - multiclass avx512_var_shift_w opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -4990,19 +5018,22 @@ } defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>, - avx512_var_shift_w_lowering; + avx512_var_shift_w<0x12, "vpsllvw", shl>; defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>, - avx512_var_shift_w_lowering; + avx512_var_shift_w<0x11, "vpsravw", sra>; defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>, - avx512_var_shift_w_lowering; + avx512_var_shift_w<0x10, "vpsrlvw", srl>; + defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; +defm : avx512_var_shift_lowering; + // Special handing for handling VPSRAV intrinsics. multiclass avx512_var_shift_int_lowering p> { Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -918,7 +918,10 @@ ; KNL: ## BB#0: ; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1] +; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; KNL-NEXT: vpsllq $32, %xmm0, %xmm0 +; KNL-NEXT: vpsraq $32, %zmm0, %zmm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0 ; KNL-NEXT: retq ; Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -491,8 +491,7 @@ ; KNL-LABEL: zext_2x8mem_to_2x64: ; KNL: ## BB#0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm0, %zmm0 ; KNL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq @@ -512,8 +511,7 @@ ; KNL-LABEL: sext_2x8mem_to_2x64mask: ; KNL: ## BB#0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbq (%rdi), %xmm1 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq @@ -872,8 +870,7 @@ ; KNL-LABEL: zext_2x16mem_to_2x64: ; KNL: ## BB#0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm0, %zmm0 ; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq @@ -894,8 +891,7 @@ ; KNL-LABEL: sext_2x16mem_to_2x64mask: ; KNL: ## BB#0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxwq (%rdi), %xmm1 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq @@ -1061,8 +1057,7 @@ ; KNL-LABEL: zext_2x32mem_to_2x64: ; KNL: ## BB#0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm0, %zmm0 ; KNL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq @@ -1083,8 +1078,7 @@ ; KNL-LABEL: sext_2x32mem_to_2x64mask: ; KNL: ## BB#0: ; KNL-NEXT: vpsllq $63, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxdq (%rdi), %xmm1 ; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq Index: test/CodeGen/X86/avx512-vec-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-vec-cmp.ll +++ test/CodeGen/X86/avx512-vec-cmp.ll @@ -1265,9 +1265,7 @@ ; KNL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; KNL-NEXT: vpsllq $32, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm1 -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vpsraq $32, %zmm0, %zmm0 ; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: retq ; Index: test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -18,8 +18,7 @@ ; CHECK-NEXT: vmovq %rax, %xmm3 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 -; CHECK-NEXT: vpsrad $31, %xmm2, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2 ; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 ; CHECK-NEXT: vandpd %xmm0, %xmm2, %xmm2 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -200,8 +200,7 @@ ; KNL: # BB#0: ; KNL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: vpsllq $63, %xmm1, %xmm1 -; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 -; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; KNL-NEXT: vpsraq $63, %zmm1, %zmm1 ; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 Index: test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-128.ll +++ test/CodeGen/X86/vector-shift-ashr-128.ll @@ -83,11 +83,10 @@ ; ; AVX512-LABEL: var_shift_v2i64: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 -; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: var_shift_v2i64: @@ -649,11 +648,9 @@ ; ; AVX512-LABEL: splatvar_shift_v2i64: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v2i64: @@ -1085,10 +1082,10 @@ ; ; AVX512-LABEL: constant_shift_v2i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,7] +; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: constant_shift_v2i64: @@ -1562,9 +1559,9 @@ ; ; AVX512-LABEL: splatconstant_shift_v2i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrad $7, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v2i64: Index: test/CodeGen/X86/vector-shift-ashr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-256.ll +++ test/CodeGen/X86/vector-shift-ashr-256.ll @@ -71,11 +71,10 @@ ; ; AVX512-LABEL: var_shift_v4i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 -; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: var_shift_v4i64: @@ -491,11 +490,9 @@ ; ; AVX512-LABEL: splatvar_shift_v4i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatvar_shift_v4i64: @@ -836,10 +833,10 @@ ; ; AVX512-LABEL: constant_shift_v4i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62] +; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: constant_shift_v4i64: @@ -1204,9 +1201,9 @@ ; ; AVX512-LABEL: splatconstant_shift_v4i64: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrad $7, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v4i64: