Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -411,9 +411,6 @@
   def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
                         [IntrNoMem, Commutative]>;
-  def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty,
-                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
 }
 
 // Integer shift ops.
@@ -1660,9 +1657,6 @@
   def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
-                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
 }
 
 // Integer shift ops.
@@ -4689,9 +4683,6 @@
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                          llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
-                        [IntrNoMem, Commutative]>;
 }
 
 // Integer arithmetic ops
 let TargetPrefix = "x86" in {
Index: llvm/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -298,7 +298,10 @@
       Name.startswith("avx512.ptestnm") || //Added in 6.0
       Name.startswith("sse2.pavg") || // Added in 6.0
       Name.startswith("avx2.pavg") || // Added in 6.0
-      Name.startswith("avx512.mask.pavg")) // Added in 6.0
+      Name.startswith("avx512.mask.pavg") || // Added in 6.0
+      Name == "sse2.psad.bw" || // Added in 7.0
+      Name == "avx2.psad.bw" || // Added in 7.0
+      Name == "avx512.psad.bw.512") // Added in 7.0
     return true;
 
   return false;
@@ -1182,6 +1185,35 @@
   return;
 }
 
+// Upgrades calls to the PSADBW intrinsics.
+static Value *UpgradeX86SAD(IRBuilder<> &Builder, CallInst &CI) {
+  // The operands arrive already bitcast to byte vectors.
+  Value *A = CI.getArgOperand(0);
+  Value *B = CI.getArgOperand(1);
+  // N is the number of qwords in the result.
+  unsigned N = cast<VectorType>(CI.getType())->getBitWidth() / 64;
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_UGT, A, B);
+  Value *AD = Builder.CreateSelect(Cmp, Builder.CreateSub(A, B),
+                                   Builder.CreateSub(B, A));
+
+  Type *QTy = Builder.getInt64Ty();
+  Type *VTy = VectorType::get(QTy, N);
+  SmallVector<uint32_t, 8> ShuffleMask(N);
+  for (unsigned i = 0; i < N; ++i)
+    ShuffleMask[i] = i * 8;
+  Value *Res = Builder.CreateZExt(
+      Builder.CreateShuffleVector(AD, AD, ShuffleMask), VTy);
+  for (unsigned i = 1; i < 8; ++i) {
+    for (unsigned j = 0; j < N; ++j)
+      ShuffleMask[j] = i + j * 8;
+    Value *Sum = Builder.CreateShuffleVector(AD, AD, ShuffleMask);
+    Res = Builder.CreateAdd(Res, Builder.CreateZExt(Sum, VTy));
+  }
+  return Res;
+
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -2342,6 +2374,12 @@
   } else if (IsX86 && Name.startswith("avx512.mask.") &&
              upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
     // Rep will be updated by the call in the condition.
+  } else if (IsX86 &&
+             (Name.startswith("sse2.psad") || Name.startswith("avx2.psad") ||
+              Name.startswith("avx512.psad"))) {
+    // llvm.x86.sse2.psad.bw, llvm.x86.avx2.psad.bw,
+    // llvm.x86.avx512.psad.bw.512
+    Rep = UpgradeX86SAD(Builder, *CI);
   } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
     Value *Arg = CI->getArgOperand(0);
     Value *Neg = Builder.CreateNeg(Arg, "neg");
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38051,9 +38051,109 @@
                           PMADDBuilder);
 }
 
+// Try to find a sum-of-shuffles pattern for PSADBW.
+static SDValue combineScalarSADPattern(SDNode *Node, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  if (Node->getOpcode() != ISD::ADD)
+    return SDValue();
+  EVT Ty = Node->getValueType(0);
+  if (!Ty.isVector() || Ty.getScalarSizeInBits() != 64)
+    return SDValue();
+  unsigned N = Ty.getVectorNumElements();
+  if ((N != 2 && N != 4 && N != 8) || (N == 2 && !Subtarget.hasSSE2()) ||
+      (N == 4 && !Subtarget.hasAVX2()) ||
+      (N == 8 && (!Subtarget.hasAVX512() || !Subtarget.hasBWI())))
+    return SDValue();
+  SDValue ByteGroup[8], Cap, Top;
+  bool TopSpotted = false;
+  ByteGroup[0] = SDValue(Node, 0);
+  bool ByteGroupDone[8] = {false, false, false, false,
+                           false, false, false, false};
+  unsigned NumByteGroups = 1;
+  // For the vector SAD we expect the following pattern: bytes are extracted
+  // at offsets 0-7 within each qword, creating 8 vectors of byte differences
+  // that are then summed.
+  while (NumByteGroups) {
+    // If it's a sum, go up the tree.
+    if (ByteGroup[NumByteGroups - 1].getOpcode() == ISD::ADD) {
+      if (NumByteGroups == 8)
+        return SDValue();
+      ByteGroup[NumByteGroups] = ByteGroup[NumByteGroups - 1].getOperand(1);
+      ByteGroup[NumByteGroups - 1] = ByteGroup[NumByteGroups - 1].getOperand(0);
+      ++NumByteGroups;
+      // Check that the operand of the zext is a shuffle of the absolute
+      // difference pattern's output with the expected shuffle mask.
+    } else if (ByteGroup[NumByteGroups - 1].getOpcode() == ISD::ZERO_EXTEND &&
+               ByteGroup[NumByteGroups - 1]
+                       .getValueType()
+                       .getVectorNumElements() == N) {
+      --NumByteGroups;
+      Cap = ByteGroup[NumByteGroups].getOperand(0);
+      if (Cap.getOpcode() != ISD::BUILD_VECTOR)
+        return SDValue();
+      if (Cap.getNumOperands() < N)
+        return SDValue();
+      SDValue Extract = Cap.getOperand(0);
+      if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+        return SDValue();
+      if (!TopSpotted) {
+        Top = Extract.getOperand(0);
+        TopSpotted = true;
+      } else if (Extract.getOperand(0) != Top)
+        return SDValue();
+      auto C = dyn_cast<ConstantSDNode>(Extract.getOperand(1).getNode());
+      if (!C)
+        return SDValue();
+      unsigned Pos = C->getZExtValue();
+      if (Pos >= 8 || ByteGroupDone[Pos])
+        return SDValue();
+      for (unsigned i = 1; i < N; ++i) {
+        Extract = Cap.getOperand(i);
+        if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+            Extract.getOperand(0) != Top)
+          return SDValue();
+        auto C = dyn_cast<ConstantSDNode>(Extract.getOperand(1).getNode());
+        if (!C || C->getZExtValue() != Pos + 8 * i)
+          return SDValue();
+      }
+      ByteGroupDone[Pos] = true;
+    } else
+      return SDValue();
+  }
+  // Check that we've found all 8 byte groups.
+  for (unsigned i = 0; i < 8; ++i)
+    if (!ByteGroupDone[i])
+      return SDValue();
+  // Check that Top points to an absolute-difference pattern, that is
+  // AD[i] = (A[i] > B[i]) ? A[i] - B[i] : B[i] - A[i]
+  if (Top.getOpcode() != ISD::VSELECT)
+    return SDValue();
+  SDValue SubP = Top.getOperand(1);
+  SDValue SubN = Top.getOperand(2);
+  if (SubP.getOpcode() != ISD::SUB || SubN.getOpcode() != ISD::SUB)
+    return SDValue();
+  Top = Top.getOperand(0);
+  if (Top.getOpcode() != ISD::SETCC)
+    return SDValue();
+  ISD::CondCode CC = cast<CondCodeSDNode>(Top.getOperand(2))->get();
+  if (CC != ISD::SETUGT && CC != ISD::SETUGE)
+    return SDValue();
+  SDValue A = Top.getOperand(0);
+  SDValue B = Top.getOperand(1);
+  if (SubP.getOperand(0) != A || SubP.getOperand(1) != B ||
+      SubN.getOperand(0) != B || SubN.getOperand(1) != A)
+    return SDValue();
+
+  SDLoc DL(Node);
+  MVT VT = MVT::getVectorVT(MVT::i64, N);
+  return DAG.getNode(X86ISD::PSADBW, DL, VT, A, B);
+}
+
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   const SDNodeFlags Flags = N->getFlags();
+  if (SDValue VSad = combineScalarSADPattern(N, DAG, Subtarget))
+    return VSad;
   if (Flags.hasVectorReduction()) {
     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
       return Sad;
Index: llvm/lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -416,7 +416,6 @@
   X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
   X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
-  X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
   X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1449,7 +1448,6 @@
   X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
   X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0),
-  X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
   X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
@@ -1610,7 +1608,6 @@
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
-  X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
   X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
Index: llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
===================================================================
--- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2695,8 +2695,6 @@
       break;
 
     case Intrinsic::x86_mmx_psad_bw:
-    case Intrinsic::x86_sse2_psad_bw:
-    case Intrinsic::x86_avx2_psad_bw:
       handleVectorSadIntrinsic(I);
       break;
 
Index: llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -2034,16 +2034,88 @@
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind
readonly define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) { -; CHECK-LABEL: test_mm256_sad_epu8: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: test_mm256_sad_epu8: +; X86: # %bb.0: +; X86-NEXT: vpminub %ymm1, %ymm0, %ymm2 +; X86-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; X86-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; X86-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpsubb %ymm1, %ymm0, %ymm3 +; X86-NEXT: vpsubb %ymm0, %ymm1, %ymm0 +; X86-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm1 +; X86-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,zero,ymm0[25],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[2],zero,zero,zero,zero,zero,zero,zero,ymm0[10],zero,zero,zero,zero,zero,zero,zero,ymm0[18],zero,zero,zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[3],zero,zero,zero,zero,zero,zero,zero,ymm0[11],zero,zero,zero,zero,zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,zero,zero,zero,zero,zero,zero,ymm0[12],zero,zero,zero,zero,zero,zero,zero,ymm0[20],zero,zero,zero,zero,zero,zero,zero,ymm0[28],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[5],zero,zero,zero,zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,zero,zero,zero,ymm0[21],zero,zero,zero,zero,zero,zero,zero,ymm0[29],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,zero,ymm0[22],zero,zero,zero,zero,zero,zero,zero,ymm0[30],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpsrlq $56, %ymm0, %ymm0 +; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; X86-NEXT: ret{{[l|q]}} +; +; X64-LABEL: test_mm256_sad_epu8: +; X64: # %bb.0: +; X64-NEXT: vpminub %ymm1, %ymm0, %ymm2 +; X64-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; X64-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; X64-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm3 +; X64-NEXT: vpsubb %ymm0, %ymm1, %ymm0 +; X64-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm1 +; X64-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,zero,ymm0[25],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[2],zero,zero,zero,zero,zero,zero,zero,ymm0[10],zero,zero,zero,zero,zero,zero,zero,ymm0[18],zero,zero,zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X64-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[3],zero,zero,zero,zero,zero,zero,zero,ymm0[11],zero,zero,zero,zero,zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X64-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,zero,zero,zero,zero,zero,zero,ymm0[12],zero,zero,zero,zero,zero,zero,zero,ymm0[20],zero,zero,zero,zero,zero,zero,zero,ymm0[28],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpaddq 
%ymm3, %ymm2, %ymm2 +; X64-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[5],zero,zero,zero,zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,zero,zero,zero,ymm0[21],zero,zero,zero,zero,zero,zero,zero,ymm0[29],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X64-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,zero,ymm0[22],zero,zero,zero,zero,zero,zero,zero,ymm0[30],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X64-NEXT: vpsrlq $56, %ymm0, %ymm0 +; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; X64-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1) + %1 = icmp ugt <32 x i8> %arg0, %arg1 + %2 = sub <32 x i8> %arg0, %arg1 + %3 = sub <32 x i8> %arg1, %arg0 + %4 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %3 + %5 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %6 = zext <4 x i8> %5 to <4 x i64> + %7 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %8 = zext <4 x i8> %7 to <4 x i64> + %9 = add <4 x i64> %6, %8 + %10 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %11 = zext <4 x i8> %10 to <4 x i64> + %12 = add <4 x i64> %9, %11 + %13 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %14 = zext <4 x i8> %13 to <4 x i64> + %15 = add <4 x i64> %12, %14 + %16 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %17 = zext <4 x i8> %16 to <4 x i64> + %18 = add <4 x i64> %15, %17 + %19 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %20 = zext <4 x i8> %19 to <4 x i64> + %21 = add <4 x i64> %18, %20 + %22 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %23 = zext <4 x i8> %22 to <4 x i64> + %24 = add <4 x i64> %21, %23 + %25 = shufflevector <32 x i8> %4, <32 x i8> %4, <4 x i32> + %26 = zext <4 x i8> %25 to <4 x i64> + %res = add <4 x i64> %24, %26 ret <4 x i64> %res } -declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_shuffle_epi32: Index: llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -52,6 +52,22 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone +define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) { +; X86-LABEL: test_x86_avx2_psad_bw: +; X86: ## %bb.0: +; X86-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: test_x86_avx2_psad_bw: +; X64: ## %bb.0: +; X64-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; X64-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone + + define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) { ; X86-LABEL: test_x86_avx2_movntdqa: ; X86: ## %bb.0: Index: llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -485,32 +485,6 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone -define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, 
<32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psad_bw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psad_bw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psad_bw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psad_bw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone - - define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) { ; X86-AVX-LABEL: test_x86_avx2_psll_d: ; X86-AVX: ## %bb.0: @@ -1330,28 +1304,28 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI53_0, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI54_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: vmovaps LCPI53_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI53_0, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI53_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI54_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI53_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) ret <16 x i16> %res @@ -2071,36 +2045,36 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X86-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A 
- offset: 5, value: LCPI86_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI85_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI86_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: vmovdqa LCPI85_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI86_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI85_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X64-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res @@ -2136,36 +2110,36 @@ ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI87_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; 
X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI88_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: vmovdqa LCPI87_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI88_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI87_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI87_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI87_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI87_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI87_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res Index: llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -105,6 +105,28 @@ ret <32 x i16> %res4 } +declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) + +define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ +; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_mask_psadb_w_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 +; AVX512F-32-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; 
AVX512F-32-NEXT: retl + %res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) + %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64) define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) { Index: llvm/test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1443,28 +1443,6 @@ ret <32 x i16> %res4 } -declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) - -define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ -; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_mask_psadb_w_512: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vpsadbw %zmm1, %zmm0, %zmm1 -; AVX512F-32-NEXT: vpsadbw %zmm2, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: retl - %res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) - %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 -} - declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { Index: llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2062,19 +2062,120 @@ define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_sad_epu8: ; X32: # %bb.0: -; X32-NEXT: psadbw %xmm1, %xmm0 +; X32-NEXT: movdqa %xmm0, %xmm2 +; X32-NEXT: pminub %xmm1, %xmm0 +; X32-NEXT: pcmpeqb %xmm2, %xmm0 +; X32-NEXT: pcmpeqd %xmm3, %xmm3 +; X32-NEXT: pxor %xmm0, %xmm3 +; X32-NEXT: movdqa %xmm2, %xmm4 +; X32-NEXT: psubb %xmm1, %xmm4 +; X32-NEXT: psubb %xmm2, %xmm1 +; X32-NEXT: pandn %xmm1, %xmm3 +; X32-NEXT: pandn %xmm4, %xmm0 +; X32-NEXT: por %xmm3, %xmm0 +; X32-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; X32-NEXT: movdqa %xmm0, %xmm1 +; X32-NEXT: pand %xmm2, %xmm1 +; X32-NEXT: movdqa %xmm0, %xmm3 +; X32-NEXT: psrlw $8, %xmm3 +; X32-NEXT: pand %xmm2, %xmm3 +; X32-NEXT: movdqa %xmm0, %xmm4 +; X32-NEXT: psrld $16, %xmm4 +; X32-NEXT: pand %xmm2, %xmm4 +; X32-NEXT: paddq %xmm3, %xmm4 +; X32-NEXT: movdqa %xmm0, %xmm3 +; X32-NEXT: psrld $24, %xmm3 +; X32-NEXT: pand %xmm2, %xmm3 +; X32-NEXT: paddq %xmm4, %xmm3 +; X32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X32-NEXT: pand %xmm2, %xmm4 +; X32-NEXT: paddq %xmm3, %xmm4 +; X32-NEXT: movdqa %xmm0, %xmm3 +; X32-NEXT: psrlq $40, %xmm3 +; X32-NEXT: pand %xmm2, %xmm3 +; X32-NEXT: paddq %xmm4, %xmm3 +; X32-NEXT: movdqa %xmm0, %xmm4 +; X32-NEXT: psrlq $48, %xmm4 +; X32-NEXT: pand %xmm2, %xmm4 +; X32-NEXT: paddq %xmm3, %xmm4 +; X32-NEXT: psrlq $56, %xmm0 +; X32-NEXT: paddq %xmm4, %xmm0 +; X32-NEXT: paddq %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_sad_epu8: ; 
X64: # %bb.0: -; X64-NEXT: psadbw %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: pminub %xmm1, %xmm0 +; X64-NEXT: pcmpeqb %xmm2, %xmm0 +; X64-NEXT: pcmpeqd %xmm3, %xmm3 +; X64-NEXT: pxor %xmm0, %xmm3 +; X64-NEXT: movdqa %xmm2, %xmm4 +; X64-NEXT: psubb %xmm1, %xmm4 +; X64-NEXT: psubb %xmm2, %xmm1 +; X64-NEXT: pandn %xmm1, %xmm3 +; X64-NEXT: pandn %xmm4, %xmm0 +; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psrlw $8, %xmm3 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: psrld $16, %xmm4 +; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: paddq %xmm3, %xmm4 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psrld $24, %xmm3 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: paddq %xmm4, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: paddq %xmm3, %xmm4 +; X64-NEXT: movdqa %xmm0, %xmm3 +; X64-NEXT: psrlq $40, %xmm3 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: paddq %xmm4, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: psrlq $48, %xmm4 +; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: paddq %xmm3, %xmm4 +; X64-NEXT: psrlq $56, %xmm0 +; X64-NEXT: paddq %xmm4, %xmm0 +; X64-NEXT: paddq %xmm1, %xmm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %arg0, <16 x i8> %arg1) + %1 = icmp ugt <16 x i8> %arg0, %arg1 + %2 = sub <16 x i8> %arg0, %arg1 + %3 = sub <16 x i8> %arg1, %arg0 + %4 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %3 + %5 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %6 = zext <2 x i8> %5 to <2 x i64> + %7 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %8 = zext <2 x i8> %7 to <2 x i64> + %9 = add <2 x i64> %6, %8 + %10 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %11 = zext <2 x i8> %10 to <2 x i64> + %12 = add <2 x i64> %9, %11 + %13 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %14 = zext <2 x i8> %13 to <2 x i64> + %15 = add <2 x i64> %12, %14 + %16 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %17 = zext <2 x i8> %16 to <2 x i64> + %18 = add <2 x i64> %15, %17 + %19 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %20 = zext <2 x i8> %19 to <2 x i64> + %21 = add <2 x i64> %18, %20 + %22 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %23 = zext <2 x i8> %22 to <2 x i64> + %24 = add <2 x i64> %21, %23 + %25 = shufflevector <16 x i8> %4, <16 x i8> %4, <2 x i32> + %26 = zext <2 x i8> %25 to <2 x i64> + %res = add <2 x i64> %24, %26 ret <2 x i64> %res } -declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind { ; X32-LABEL: test_mm_set_epi8: Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -1,6 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s +define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: test_x86_sse2_psad_bw: +; CHECK: ## %bb.0: +; 
CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + + define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { ; CHECK-LABEL: test_x86_sse2_psll_dq_bs: ; CHECK: ## %bb.0: Index: llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1129,27 +1129,6 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone -define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_psad_bw: -; SSE: ## %bb.0: -; SSE-NEXT: psadbw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse2_psad_bw: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse2_psad_bw: -; SKX: ## %bb.0: -; SKX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone - - define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: test_x86_sse2_psll_d: ; SSE: ## %bb.0: Index: llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll =================================================================== --- llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -38,20 +38,6 @@ ; CHECK: ret x86_mmx -define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory { - %c = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a, <16 x i8> %b) - ret <2 x i64> %c -} - -; CHECK-LABEL: @Test_x86_sse2_psad_bw( -; CHECK: or <16 x i8> {{.*}}, {{.*}} -; CHECK: bitcast <16 x i8> {{.*}} to <2 x i64> -; CHECK: icmp ne <2 x i64> {{.*}}, zeroinitializer -; CHECK: sext <2 x i1> {{.*}} to <2 x i64> -; CHECK: lshr <2 x i64> {{.*}}, -; CHECK: ret <2 x i64> - - define x86_mmx @Test_x86_mmx_psad_bw(x86_mmx %a, x86_mmx %b) sanitize_memory { entry: %c = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind
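
For reference only (not part of the patch): a minimal scalar sketch of the PSADBW semantics that both the UpgradeX86SAD expansion and the combineScalarSADPattern DAG combine describe, namely that every group of eight byte-wise absolute differences accumulates into one 64-bit lane. The helper name ReferenceSAD and its plain-pointer interface are illustrative assumptions, not code from the LLVM tree.

#include <cstdint>
#include <vector>

// Illustrative reference model (hypothetical helper, not LLVM code).
// NumBytes is 16, 32 or 64 for the SSE2, AVX2 and AVX-512BW forms; the
// result holds NumBytes / 8 qword lanes.
static std::vector<uint64_t> ReferenceSAD(const uint8_t *A, const uint8_t *B,
                                          unsigned NumBytes) {
  std::vector<uint64_t> Res(NumBytes / 8, 0);
  for (unsigned i = 0; i < NumBytes; ++i) {
    // AD[i] = (A[i] > B[i]) ? A[i] - B[i] : B[i] - A[i], the same per-byte
    // absolute difference the auto-upgrade IR and the DAG pattern compute.
    uint8_t AD = A[i] > B[i] ? uint8_t(A[i] - B[i]) : uint8_t(B[i] - A[i]);
    Res[i / 8] += AD; // byte offset i % 8 accumulates into qword lane i / 8
  }
  return Res;
}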