Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -17071,6 +17071,44 @@
   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
 }
 
+static SDValue SRAFoldConstant(SDLoc DL, EVT VT, SDValue Cst1, SDValue Cst2,
+                               SelectionDAG &DAG) {
+
+  // For vectors, extract each constant element so we can constant
+  // fold them individually.
+  BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1.getNode());
+  BuildVectorSDNode *BV2 = dyn_cast<BuildVectorSDNode>(Cst2.getNode());
+  if (!BV1 || !BV2)
+    return SDValue();
+
+  assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!");
+
+  EVT SVT = VT.getScalarType();
+  SmallVector<SDValue, 16> Outputs;
+  for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) {
+    ConstantSDNode *V1 = dyn_cast<ConstantSDNode>(BV1->getOperand(I));
+    ConstantSDNode *V2 = dyn_cast<ConstantSDNode>(BV2->getOperand(I));
+    if (!V1 || !V2) // Not a constant, bail.
+      return SDValue();
+
+    if (V1->isOpaque() || V2->isOpaque())
+      return SDValue();
+
+    if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
+      return SDValue();
+
+    // Fold one vector element.
+    const APInt &C1 = V1->getAPIntValue();
+    const APInt &C2 = V2->getAPIntValue();
+    unsigned ShiftAmt = C2.getLimitedValue(C1.getBitWidth() - 1);
+    APInt Val = C1.ashr(ShiftAmt);
+    Outputs.push_back(DAG.getConstant(Val, DL, SVT));
+  }
+
+  // Build a big vector out of the scalar elements we generated.
+  return DAG.getBuildVector(VT, DL, Outputs);
+}
+
 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -17683,6 +17721,27 @@
     return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                 Mask, PassThru, Subtarget, DAG);
   }
+  case INTR_SRA_MASK:
+  case INTR_SRA: {
+    SDValue Src1 = Op.getOperand(1);
+    SDValue Src2 = Op.getOperand(2);
+    // From the spec: if the value specified in the respective data element of
+    // the count operand is greater than the element size, then the destination
+    // data element is filled with the corresponding sign bit of the source
+    // element. This differs from LLVM's SRA, where such a shift yields undef.
+    // Perform constant folding before the SRA node is created.
+    SDValue SRA = SRAFoldConstant(dl, VT, Src1, Src2, DAG);
+
+    if (!SRA.getNode())
+      SRA = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
+
+    if (IntrData->Type == INTR_SRA)
+      return SRA;
+
+    return getVectorMaskingNode(SRA, Op.getOperand(4), Op.getOperand(3),
+                                Subtarget, DAG);
+  }
+
   default:
     break;
   }
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -35,7 +35,8 @@
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   EXPAND_FROM_MEM, STOREANT, BLEND, INSERT_SUBVEC, TERLOG_OP_MASK,
   TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
-  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK,
+  INTR_SRA_MASK, INTR_SRA
 };
 
 struct IntrinsicData {
@@ -321,8 +322,8 @@
   X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
   X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
   X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
-  X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx2_psrav_d, INTR_SRA, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_SRA, ISD::SRA, 0),
   X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
@@ -1434,15 +1435,15 @@
   X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VSRAI, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
-  X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_SRA_MASK, ISD::SRA, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_SRA_MASK, ISD::SRA, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
   X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
Index: test/CodeGen/X86/avx2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1457,8 +1457,21 @@
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+define <4 x i32> @test_x86_avx2_psrav_d_fold(<4 x i32> %a0, <4 x i32> %a1) {
+; AVX2-LABEL: test_x86_avx2_psrav_d_fold:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,4294967295,0]
+; AVX2-NEXT:    retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_fold:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [1,0,4294967295,0]
+; AVX512VL-NEXT:    retl
+  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> )
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX2-LABEL: test_x86_avx2_psrav_d_256:
@@ -1473,6 +1486,20 @@
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
+
+define <8 x i32> @test_x86_avx2_psrav_d_256_fold(<8 x i32> %a0, <8 x i32> %a1) {
+; AVX2-LABEL: test_x86_avx2_psrav_d_256_fold:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,4294967295,0,4294967295,0,4294967295,0]
+; AVX2-NEXT:    retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_fold:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [1,0,4294967295,0,4294967295,0,4294967295,0]
+; AVX512VL-NEXT:    retl
+  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> )
+  ret <8 x i32> %res
+}
 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
Index: test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics.ll
+++ test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -3107,6 +3107,22 @@
   ret <32 x i16> %res4
 }
 
+define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_fold(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_fold:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vmovdqu16 {{.*#+}} zmm0 = [1,0,65535,0,65535,0,65535,0,1,0,65535,0,65535,0,65535,0,1,0,65535,0,65535,0,65535,0,1,0,65535,0,65535,0,65535,0]
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_fold:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vmovdqu16 {{.*#+}} zmm0 = [1,0,65535,0,65535,0,65535,0,1,0,65535,0,65535,0,65535,0,1,0,65535,0,65535,0,65535,0,1,0,65535,0,65535,0,65535,0]
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> ,
+                                                          <32 x i16> ,
+                                                          <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
 declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -8063,6 +8063,17 @@
   ret <8 x i32> %res4
 }
 
+define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_fold() {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si_fold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [1,0,4294967295,0,4294967295,0,4294967295,0]
+; CHECK-NEXT:    ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x05,A,A,A,A]
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI520_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> , <8 x i32> , <8 x i32> zeroinitializer, i8 -1)
+  ret <8 x i32> %res
+}
+
 declare <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
 
 define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
@@ -8083,6 +8094,17 @@
   ret <2 x i64> %res4
 }
 
+define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_fold(i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128_fold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [1,18446744073709551615]
+; CHECK-NEXT:    ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x05,A,A,A,A]
+; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI522_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> , <2 x i64> , <2 x i64> zeroinitializer, i8 -1)
+  ret <2 x i64> %res
+}
+
 declare <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
 
 define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
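
Note (not part of the patch): a minimal standalone sketch of the per-lane fold that SRAFoldConstant performs, assuming only llvm/ADT/APInt.h; the helper name foldSraLane is invented for the example. It shows why the count is clamped with getLimitedValue(BitWidth - 1): an oversized count such as 35 on a 32-bit lane saturates to 31, so the lane fills with the sign bit instead of the undef a plain ISD::SRA would allow.

// Sketch only; mirrors the clamped ashr used by the new SRAFoldConstant helper.
#include "llvm/ADT/APInt.h"
#include <cassert>

static llvm::APInt foldSraLane(const llvm::APInt &Src, const llvm::APInt &Cnt) {
  // Clamp the per-lane count to BitWidth - 1 so oversized counts produce an
  // all-sign-bits result, matching the VPSRAV* instruction semantics.
  unsigned ShiftAmt = Cnt.getLimitedValue(Src.getBitWidth() - 1);
  return Src.ashr(ShiftAmt);
}

int main() {
  llvm::APInt Src(32, -12, /*isSigned=*/true); // 0xFFFFFFF4
  llvm::APInt Cnt(32, 35);                     // larger than the 32-bit lane
  assert(foldSraLane(Src, Cnt) == llvm::APInt(32, -1, /*isSigned=*/true));
  return 0;
}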