Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -310,6 +310,11 @@ // Vector shift elements VSHL, VSRL, VSRA, + // Vector variable shift right arithmetic. + // Unlike ISD::SRA, in case shift count greater then element size + // use sign bit to fill destination data element. + VSRAV, + // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -22022,6 +22022,7 @@ case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::VPPERM: return "X86ISD::VPPERM"; Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -4297,9 +4297,14 @@ defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, avx512_var_shift_w<0x12, "vpsllvw", shl>, avx512_var_shift_w_lowering; + defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, avx512_var_shift_w<0x11, "vpsravw", sra>, avx512_var_shift_w_lowering; +let isCodeGenOnly = 1 in + defm VPSRAV_Int : avx512_var_shift_types<0x46, "vpsrav", X86vsrav>, + avx512_var_shift_w<0x11, "vpsravw", X86vsrav>; + defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, avx512_var_shift_w<0x10, "vpsrlvw", srl>, avx512_var_shift_w_lowering; Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -216,6 +216,8 @@ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVec<2>]>>; +def X86vsrav : SDNode<"X86ISD::VSRAV" , SDTIntShiftOp>; + def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -1547,6 +1547,8 @@ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, + { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 }, + { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 }, { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -8764,6 +8764,8 @@ defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; + let isCodeGenOnly = 1 in + defm VPSRAVD_Int : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>; } //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -312,8 +312,8 @@ X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0), X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), @@ -1365,15 +1365,15 @@ X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, X86ISD::VSRAV, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), Index: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1457,8 +1457,23 @@ %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone +define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) { +; AVX2-LABEL: test_x86_avx2_psrav_d_const: +; AVX2: ## BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] +; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_psrav_d_const: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [2,9,4294967284,23] +; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 +; AVX512VL-NEXT: retl + %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; AVX2-LABEL: test_x86_avx2_psrav_d_256: @@ -1473,6 +1488,22 @@ %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } + +define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) { +; AVX2-LABEL: test_x86_avx2_psrav_d_256_const: +; AVX2: ## BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) + ret <8 x i32> %res +} declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) { Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2935,6 +2935,24 @@ ret <32 x i16> %res4 } +define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> , + <32 x i16> , + <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) { Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -7950,6 +7950,19 @@ ret <8 x i32> %res4 } +define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() { +; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si_const: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; CHECK-NEXT: ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x05,A,A,A,A] +; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI510_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0x05,A,A,A,A] +; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI510_1-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> , <8 x i32> , <8 x i32> zeroinitializer, i8 -1) + ret <8 x i32> %res +} + declare <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { @@ -7970,6 +7983,19 @@ ret <2 x i64> %res4 } +define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128_const: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovdqa64 {{.*#+}} xmm0 = [2,18446744073709551607] +; CHECK-NEXT: ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x05,A,A,A,A] +; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI512_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] +; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI512_1-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> , <2 x i64> , <2 x i64> zeroinitializer, i8 -1) + ret <2 x i64> %res +} + declare <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {