Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5391,10 +5391,6 @@ EVT VT = N->getValueType(0); - // FIXME - if (VT.isVector()) - return SDValue(); - // There are 3 commutable operators in the pattern, // so we have to deal with 8 possible variants of the basic pattern. SDValue X, Y, M; Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h @@ -443,9 +443,18 @@ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; - bool hasAndNotCompare(SDValue) const override { - // 'bics' - return true; + bool hasAndNotCompare(SDValue V) const override { + // We can use bics for any scalar. + return V.getValueType().isScalarInteger(); + } + + bool hasAndNot(SDValue Y) const override { + EVT VT = Y.getValueType(); + + if (!VT.isVector()) + return hasAndNotCompare(Y); + + return VT.getSizeInBits() >= 64; // vector 'bic' } bool hasBitPreservingFPLogic(EVT VT) const override { Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -4751,26 +4751,39 @@ } bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { - // A mask and compare against constant is ok for an 'andn' too - // even though the BMI instruction doesn't have an immediate form. + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; if (!Subtarget.hasBMI()) return false; // There are only 32-bit and 64-bit forms for 'andn'. 
- EVT VT = Y.getValueType(); if (VT != MVT::i32 && VT != MVT::i64) return false; + // A mask and compare against constant is ok for an 'andn' too + // even though the BMI instruction doesn't have an immediate form. + return true; } bool X86TargetLowering::hasAndNot(SDValue Y) const { - // x86 can't form 'andn' with an immediate. - if (isa<ConstantSDNode>(Y)) + EVT VT = Y.getValueType(); + + if (!VT.isVector()) // x86 can't form 'andn' with an immediate. + return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y); + + // Vector. + + if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128) return false; - return hasAndNotCompare(Y); + if (VT == MVT::v4i32) + return true; + + return Subtarget.hasSSE2(); } MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { Index: llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll +++ llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll @@ -77,9 +77,8 @@ ; CHECK-LABEL: in_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x %n1 = and <4 x i32> %n0, %mask @@ -107,9 +106,8 @@ ; CHECK-LABEL: in_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1> %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x @@ -134,8 +132,8 @@ define <4 x i32> @in_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_mone_vary: ;
CHECK: // %bb.0: -; CHECK-NEXT: bic v0.16b, v2.16b, v1.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bic v0.16b, v1.16b, v2.16b +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -161,9 +159,8 @@ define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_mone_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mvn v0.16b, v1.16b -; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v1.16b, v2.16b +; CHECK-NEXT: orn v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1> %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x @@ -189,10 +186,9 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.4s, #42 -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: movi v2.4s, #42 +; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -219,10 +215,9 @@ define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v0.4s, #42 -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: movi v2.4s, #42 +; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1> %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x Index: llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll +++
llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll @@ -270,9 +270,8 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -287,9 +286,8 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-LABEL: in_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i8> %x, %y %n1 = and <2 x i8> %n0, %mask @@ -300,9 +298,8 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: in_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -317,9 +314,8 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -330,9 +326,8 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, 
v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -343,9 +338,8 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: in_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -360,9 +354,8 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -373,9 +366,8 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -386,9 +378,8 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: in_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -399,9 +390,8 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: in_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: 
eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -416,9 +406,8 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -429,9 +418,8 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -442,9 +430,8 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: in_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -455,9 +442,8 @@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: in_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask Index: llvm/trunk/test/CodeGen/X86/machine-cp.ll =================================================================== --- 
llvm/trunk/test/CodeGen/X86/machine-cp.ll +++ llvm/trunk/test/CodeGen/X86/machine-cp.ll @@ -101,54 +101,64 @@ define <16 x float> @foo(<16 x float> %x) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: xorps %xmm8, %xmm8 -; CHECK-NEXT: cvttps2dq %xmm3, %xmm9 -; CHECK-NEXT: movaps %xmm3, %xmm13 -; CHECK-NEXT: cmpltps %xmm8, %xmm13 -; CHECK-NEXT: movaps {{.*#+}} xmm7 = [1,1,1,1] -; CHECK-NEXT: movaps %xmm13, %xmm3 -; CHECK-NEXT: andps %xmm7, %xmm3 -; CHECK-NEXT: cvttps2dq %xmm2, %xmm10 -; CHECK-NEXT: movaps %xmm2, %xmm5 -; CHECK-NEXT: cmpltps %xmm8, %xmm5 -; CHECK-NEXT: movaps %xmm5, %xmm2 -; CHECK-NEXT: andps %xmm7, %xmm2 -; CHECK-NEXT: cvttps2dq %xmm1, %xmm11 +; CHECK-NEXT: movaps %xmm3, %xmm9 +; CHECK-NEXT: movaps %xmm2, %xmm8 +; CHECK-NEXT: movaps %xmm1, %xmm6 +; CHECK-NEXT: movaps %xmm0, %xmm7 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm3, %xmm1 +; CHECK-NEXT: cmpltps %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm1, %xmm4 -; CHECK-NEXT: cmpltps %xmm8, %xmm4 -; CHECK-NEXT: movaps %xmm4, %xmm1 -; CHECK-NEXT: andps %xmm7, %xmm1 -; CHECK-NEXT: cvttps2dq %xmm0, %xmm12 -; CHECK-NEXT: movaps %xmm0, %xmm6 -; CHECK-NEXT: cmpltps %xmm8, %xmm6 -; CHECK-NEXT: andps %xmm6, %xmm7 -; CHECK-NEXT: orps {{.*}}(%rip), %xmm6 -; CHECK-NEXT: movaps {{.*#+}} xmm14 = [5,6,7,8] -; CHECK-NEXT: orps %xmm14, %xmm4 -; CHECK-NEXT: movaps {{.*#+}} xmm15 = [9,10,11,12] -; CHECK-NEXT: orps %xmm15, %xmm5 -; CHECK-NEXT: movaps {{.*#+}} xmm8 = [13,14,15,16] -; CHECK-NEXT: orps %xmm8, %xmm13 -; CHECK-NEXT: cvtdq2ps %xmm12, %xmm0 -; CHECK-NEXT: cvtdq2ps %xmm11, %xmm11 -; CHECK-NEXT: cvtdq2ps %xmm10, %xmm10 -; CHECK-NEXT: cvtdq2ps %xmm9, %xmm9 -; CHECK-NEXT: andps %xmm8, %xmm9 -; CHECK-NEXT: andps %xmm15, %xmm10 -; CHECK-NEXT: andps %xmm14, %xmm11 -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: xorps %xmm7, %xmm0 +; CHECK-NEXT: orps {{.*}}(%rip), %xmm4 +; CHECK-NEXT: movaps %xmm4, %xmm10 +; CHECK-NEXT: andnps %xmm1, %xmm10 +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: 
cmpltps %xmm0, %xmm1 +; CHECK-NEXT: movaps {{.*#+}} xmm11 = [9,10,11,12] +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: orps %xmm11, %xmm3 +; CHECK-NEXT: movaps %xmm3, %xmm14 +; CHECK-NEXT: andnps %xmm1, %xmm14 +; CHECK-NEXT: cvttps2dq %xmm6, %xmm12 +; CHECK-NEXT: cmpltps %xmm0, %xmm6 +; CHECK-NEXT: movaps {{.*#+}} xmm13 = [5,6,7,8] +; CHECK-NEXT: movaps %xmm6, %xmm2 +; CHECK-NEXT: orps %xmm13, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm5 +; CHECK-NEXT: andnps %xmm6, %xmm5 +; CHECK-NEXT: cvttps2dq %xmm7, %xmm6 +; CHECK-NEXT: cmpltps %xmm0, %xmm7 +; CHECK-NEXT: movaps {{.*#+}} xmm15 = [1,2,3,4] +; CHECK-NEXT: movaps %xmm7, %xmm0 +; CHECK-NEXT: orps %xmm15, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: andnps %xmm7, %xmm1 +; CHECK-NEXT: andps %xmm15, %xmm0 +; CHECK-NEXT: cvtdq2ps %xmm6, %xmm6 ; CHECK-NEXT: andps %xmm6, %xmm0 -; CHECK-NEXT: xorps %xmm1, %xmm11 -; CHECK-NEXT: andps %xmm4, %xmm11 -; CHECK-NEXT: xorps %xmm2, %xmm10 -; CHECK-NEXT: andps %xmm5, %xmm10 -; CHECK-NEXT: xorps %xmm3, %xmm9 -; CHECK-NEXT: andps %xmm13, %xmm9 -; CHECK-NEXT: xorps %xmm7, %xmm0 -; CHECK-NEXT: xorps %xmm11, %xmm1 -; CHECK-NEXT: xorps %xmm10, %xmm2 -; CHECK-NEXT: xorps %xmm9, %xmm3 +; CHECK-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1] +; CHECK-NEXT: andps %xmm6, %xmm1 +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: andps %xmm13, %xmm2 +; CHECK-NEXT: cvtdq2ps %xmm12, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm2 +; CHECK-NEXT: andps %xmm6, %xmm5 +; CHECK-NEXT: orps %xmm5, %xmm2 +; CHECK-NEXT: andps %xmm11, %xmm3 +; CHECK-NEXT: cvttps2dq %xmm8, %xmm1 +; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm3 +; CHECK-NEXT: andps %xmm6, %xmm14 +; CHECK-NEXT: orps %xmm14, %xmm3 +; CHECK-NEXT: andps %xmm6, %xmm10 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm4 +; CHECK-NEXT: cvttps2dq %xmm9, %xmm1 +; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 +; CHECK-NEXT: andps %xmm1, %xmm4 +; CHECK-NEXT: orps %xmm10, %xmm4 +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: 
movaps %xmm4, %xmm3 ; CHECK-NEXT: retq bb: %v3 = icmp slt <16 x i32> , zeroinitializer Index: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -58,18 +58,20 @@ ; ; CHECK-SSE2-LABEL: in_constant_varx_mone: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pandn (%rdx), %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pand (%rdi), %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_varx_mone: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 ; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpandn (%rdx), %xmm0, %xmm0 -; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; CHECK-XOP-NEXT: vpand (%rdi), %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -132,21 +134,22 @@ ; ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm0, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn (%rdi), %xmm1 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask: ; CHECK-XOP: # %bb.0: -; 
CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 ; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2 -; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpandn (%rdi), %xmm0, %xmm2 +; CHECK-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -198,30 +201,29 @@ define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) { ; CHECK-SSE1-LABEL: in_constant_varx_42: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44] +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andps (%rcx), %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) +; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps {{.*}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_42: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42] -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps (%rdi), %xmm1 +; CHECK-SSE2-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_varx_42: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42] -; CHECK-XOP-NEXT: vxorps (%rdi), %xmm0, %xmm1 -; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1 -; CHECK-XOP-NEXT: 
vxorps %xmm0, %xmm1, %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1 +; CHECK-XOP-NEXT: vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -275,11 +277,10 @@ ; CHECK-SSE1-LABEL: in_constant_varx_42_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44] -; CHECK-SSE1-NEXT: movaps (%rsi), %xmm2 -; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 -; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0 -; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 +; CHECK-SSE1-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: retq @@ -287,20 +288,17 @@ ; CHECK-SSE2-LABEL: in_constant_varx_42_invmask: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42] -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm2 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm2 -; CHECK-SSE2-NEXT: andnps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps (%rdi), %xmm1 +; CHECK-SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_varx_42_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-XOP-NEXT: vmovaps {{.*#+}} xmm1 = [42,42,42,42] -; CHECK-XOP-NEXT: vxorps (%rdi), %xmm1, %xmm2 -; CHECK-XOP-NEXT: vandnps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 +; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42] +; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, 
align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -350,27 +348,27 @@ define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) { ; CHECK-SSE1-LABEL: in_constant_mone_vary: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rcx), %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 +; CHECK-SSE1-NEXT: orps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_mone_vary: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm1 ; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: andnps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_mone_vary: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0 -; CHECK-XOP-NEXT: vandnps (%rdx), %xmm0, %xmm1 -; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-XOP-NEXT: vandnps (%rsi), %xmm0, %xmm1 +; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -425,33 +423,31 @@ define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) { ; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1 -; CHECK-SSE1-NEXT: xorps {{.*}}(%rip), %xmm1 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm2 -; CHECK-SSE1-NEXT: andnps %xmm1, %xmm2 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2 -; CHECK-SSE1-NEXT: movaps %xmm2, (%rdi) +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} 
xmm1 = [nan,nan,nan,nan] +; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa (%rsi), %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pxor (%rdx), %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pand (%rsi), %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_mone_vary_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 ; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpandn %xmm1, %xmm0, %xmm1 -; CHECK-XOP-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; CHECK-XOP-NEXT: vpand (%rsi), %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -503,30 +499,29 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) { ; CHECK-SSE1-LABEL: in_constant_42_vary: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44] -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andps (%rcx), %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 +; CHECK-SSE1-NEXT: andps 
{{.*}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_42_vary: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1 -; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42] -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm1 +; CHECK-SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_42_vary: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1 -; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 +; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42] +; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -579,33 +574,29 @@ define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) { ; CHECK-SSE1-LABEL: in_constant_42_vary_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1 -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm2 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44] -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2 -; CHECK-SSE1-NEXT: andnps %xmm2, %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps (%rdx), %xmm1 +; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps {{.*}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, 
%rax ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_42_vary_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1 ; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm2 = [42,42,42,42] -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm2 -; CHECK-SSE2-NEXT: andnps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1 +; CHECK-SSE2-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_42_vary_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0 -; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm1 -; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-XOP-NEXT: vandnps %xmm2, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1 +; CHECK-XOP-NEXT: vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 Index: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -2607,16 +2607,14 @@ ; ; CHECK-SSE2-LABEL: in_v2i8: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v2i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <2 x i8> %x, %y %n1 = and 
<2 x i8> %n0, %mask @@ -2693,16 +2691,14 @@ ; ; CHECK-SSE2-LABEL: in_v4i8: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v4i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -2737,16 +2733,14 @@ ; ; CHECK-SSE2-LABEL: in_v2i16: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v2i16: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -2895,16 +2889,14 @@ ; ; CHECK-SSE2-LABEL: in_v8i8: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v8i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -2963,16 +2955,14 @@ ; ; CHECK-SSE2-LABEL: in_v4i16: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; 
CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v4i16: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -3007,16 +2997,14 @@ ; ; CHECK-SSE2-LABEL: in_v2i32: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v2i32: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -3273,16 +3261,14 @@ ; ; CHECK-SSE2-LABEL: in_v16i8: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v16i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -3401,16 +3387,14 @@ ; ; CHECK-SSE2-LABEL: in_v8i16: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; 
CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v8i16: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -3452,30 +3436,29 @@ ; ; CHECK-SSE1-LABEL: in_v4i32: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andps (%rcx), %xmm1 -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 +; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_v4i32: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1 -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm1 +; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v4i32: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0 -; CHECK-XOP-NEXT: vxorps (%rdi), %xmm0, %xmm1 -; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1 -; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1 +; CHECK-XOP-NEXT: vpcmov %xmm1, (%rsi), %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 %y = load <4 x i32>, <4 x i32> *%py, align 16 @@ -3513,16 +3496,14 @@ ; ; 
CHECK-SSE2-LABEL: in_v2i64: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v2i64: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 -; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask @@ -4067,24 +4048,23 @@ ; ; CHECK-SSE2-LABEL: in_v32i8: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2 -; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3 -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 -; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v32i8: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0 -; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1 -; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1 -; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1 +; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0 ; CHECK-XOP-NEXT: retq %x = load <32 x i8>, <32 x i8> *%px, align 32 %y = load <32 x i8>, <32 x i8> *%py, align 32 @@ 
-4402,24 +4382,23 @@ ; ; CHECK-SSE2-LABEL: in_v16i16: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2 -; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3 -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 -; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v16i16: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0 -; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1 -; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1 -; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1 +; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0 ; CHECK-XOP-NEXT: retq %x = load <16 x i16>, <16 x i16> *%px, align 32 %y = load <16 x i16>, <16 x i16> *%py, align 32 @@ -4571,24 +4550,23 @@ ; ; CHECK-SSE2-LABEL: in_v8i32: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2 -; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3 -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 -; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 +; CHECK-SSE2-NEXT: movaps 
%xmm0, %xmm2 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v8i32: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0 -; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1 -; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1 -; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1 +; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0 ; CHECK-XOP-NEXT: retq %x = load <8 x i32>, <8 x i32> *%px, align 32 %y = load <8 x i32>, <8 x i32> *%py, align 32 @@ -4664,24 +4642,23 @@ ; ; CHECK-SSE2-LABEL: in_v4i64: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2 -; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3 -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 -; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1 -; CHECK-SSE2-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v4i64: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0 -; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1 -; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1 -; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0 +; 
CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1 +; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i64>, <4 x i64> *%px, align 32 %y = load <4 x i64>, <4 x i64> *%py, align 32