Index: test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; ============================================================================ ;
+; Various cases with %x and/or %y being a constant
+; ============================================================================ ;
+
+define <4 x i32> @out_constant_varx_mone(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_mone:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: orn v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_mone(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_mone:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_mone_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_mone_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_mone_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_42:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #42
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_42(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_42:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #42
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
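+; In the *_invmask tests in this file, the roles of %mask and %notmask are
+; swapped relative to the tests above, so the variable operand is selected
+; where the mask bits are clear rather than where they are set.
+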
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_varx_42_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #42
+; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_varx_42_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #42
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_mone_vary:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_mone_vary:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_mone_vary_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: orn v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_mone_vary_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_42_vary:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: movi v2.4s, #42
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_42_vary:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.4s, #42
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: out_constant_42_vary_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: movi v2.4s, #42
+; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; CHECK-LABEL: in_constant_42_vary_invmask:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.4s, #42
+; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
Index: test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
@@ -0,0 +1,466 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; https://bugs.llvm.org/show_bug.cgi?id=37104
+
+; All the advanced stuff (negative tests, commutativity) is handled in the
+; scalar version of the test only.
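+;
+; The out_* tests below use the two-operand form (%x & %mask) | (%y & ~%mask);
+; the in_* tests further down use ((%x ^ %y) & %mask) ^ %y, which is a
+; different spelling of the same masked-merge computation.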
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <1 x i8> %x, %mask
+  %notmask = xor <1 x i8> %mask, <i8 -1>
+  %my = and <1 x i8> %y, %notmask
+  %r = or <1 x i8> %mx, %my
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d3, #0x0000ff000000ff
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %mx = and <2 x i8> %x, %mask
+  %notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
+  %my = and <2 x i8> %y, %notmask
+  %r = or <2 x i8> %mx, %my
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <1 x i16> %x, %mask
+  %notmask = xor <1 x i16> %mask, <i16 -1>
+  %my = and <1 x i16> %y, %notmask
+  %r = or <1 x i16> %mx, %my
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d3, #0xff00ff00ff00ff
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v4i8_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d3, #0xff00ff00ff00ff
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi d3, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %mx = and <2 x i16> %x, %mask
+  %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
+  %my = and <2 x i16> %y, %notmask
+  %r = or <2 x i16> %mx, %my
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <1 x i32> %x, %mask
+  %notmask = xor <1 x i32> %mask, <i32 -1>
+  %my = and <1 x i32> %y, %notmask
+  %r = or <1 x i32> %mx, %my
+  ret <1 x i32> %r
+}
+
+; ============================================================================ ;
+; 64-bit vector width
+; ============================================================================ ;
+
+define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <8 x i8> %x, %mask
+  %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <8 x i8> %y, %notmask
+  %r = or <8 x i8> %mx, %my
+  ret <8 x i8> %r
+}
+
+define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <4 x i16> %x, %mask
+  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %my = and <4 x i16> %y, %notmask
+  %r = or <4 x i16> %mx, %my
+  ret <4 x i16> %r
+}
+
+define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v4i16_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <4 x i16> %x, %mask
+  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
+  %my = and <4 x i16> %y, %notmask
+  %r = or <4 x i16> %mx, %my
+  ret <4 x i16> %r
+}
+
+define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <2 x i32> %x, %mask
+  %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
+  %my = and <2 x i32> %y, %notmask
+  %r = or <2 x i32> %mx, %my
+  ret <2 x i32> %r
+}
+
+define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: out_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <1 x i64> %x, %mask
+  %notmask = xor <1 x i64> %mask, <i64 -1>
+  %my = and <1 x i64> %y, %notmask
+  %r = or <1 x i64> %mx, %my
+  ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <16 x i8> %x, %mask
+  %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <16 x i8> %y, %notmask
+  %r = or <16 x i8> %mx, %my
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <8 x i16> %x, %mask
+  %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %my = and <8 x i16> %y, %notmask
+  %r = or <8 x i16> %mx, %my
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <4 x i32> %x, %mask
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %y, %notmask
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
+; CHECK-LABEL: out_v4i32_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <4 x i32> %x, %mask
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
+  %my = and <4 x i32> %y, %notmask
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
+; CHECK-LABEL: out_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ret
+  %mx = and <2 x i64> %x, %mask
+  %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
+  %my = and <2 x i64> %y, %notmask
+  %r = or <2 x i64> %mx, %my
+  ret <2 x i64> %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Should be the same as the previous one.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <1 x i8> %x, %y
+  %n1 = and <1 x i8> %n0, %mask
+  %r = xor <1 x i8> %n1, %y
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <2 x i8> %x, %y
+  %n1 = and <2 x i8> %n0, %mask
+  %r = xor <2 x i8> %n1, %y
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <1 x i16> %x, %y
+  %n1 = and <1 x i16> %n0, %mask
+  %r = xor <1 x i16> %n1, %y
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i8> %x, %y
+  %n1 = and <4 x i8> %n0, %mask
+  %r = xor <4 x i8> %n1, %y
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <2 x i16> %x, %y
+  %n1 = and <2 x i16> %n0, %mask
+  %r = xor <2 x i16> %n1, %y
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <1 x i32> %x, %y
+  %n1 = and <1 x i32> %n0, %mask
+  %r = xor <1 x i32> %n1, %y
+  ret <1 x i32> %r
+}
+
+; ============================================================================ ;
+; 64-bit vector width
+; ============================================================================ ;
+
+define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <8 x i8> %x, %y
+  %n1 = and <8 x i8> %n0, %mask
+  %r = xor <8 x i8> %n1, %y
+  ret <8 x i8> %r
+}
+
+define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i16> %x, %y
+  %n1 = and <4 x i16> %n0, %mask
+  %r = xor <4 x i16> %n1, %y
+  ret <4 x i16> %r
+}
+
+define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <2 x i32> %x, %y
+  %n1 = and <2 x i32> %n0, %mask
+  %r = xor <2 x i32> %n1, %y
+  ret <2 x i32> %r
+}
+
+define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: in_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %n0 = xor <1 x i64> %x, %y
+  %n1 = and <1 x i64> %n0, %mask
+  %r = xor <1 x i64> %n1, %y
+  ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-LABEL: in_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <16 x i8> %x, %y
+  %n1 = and <16 x i8> %n0, %mask
+  %r = xor <16 x i8> %n1, %y
+  ret <16 x i8> %r
+}
+
+define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
+; CHECK-LABEL: in_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <8 x i16> %x, %y
+  %n1 = and <8 x i16> %n0, %mask
+  %r = xor <8 x i16> %n1, %y
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
+; CHECK-LABEL: in_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <4 x i32> %x, %y
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
+; CHECK-LABEL: in_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %n0 = xor <2 x i64> %x, %y
+  %n1 = and <2 x i64> %n0, %mask
+  %r = xor <2 x i64> %n1, %y
+  ret <2 x i64> %r
+}
Index: test/CodeGen/X86/machine-cp.ll
===================================================================
--- test/CodeGen/X86/machine-cp.ll
+++ test/CodeGen/X86/machine-cp.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT: testl %esi, %esi
 ; CHECK-NEXT: je LBB0_1
 ; CHECK-NEXT: ## %bb.2: ## %while.body.preheader
-; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %esi, %edx
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_3: ## %while.body
 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
Index: test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -0,0 +1,618 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+
+; ============================================================================ ;
+; Various cases with %x and/or %y being a constant
+; ============================================================================ ;
+
+define <4 x i32> @out_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_mone:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andps (%rsi), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_mone:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pand (%rdi), %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_mone:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT: vpand (%rdi), %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_mone:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0
+; CHECK-SSE1-NEXT: andnps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_mone:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pandn (%rdx), %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_mone:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpandn (%rdx), %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1
+; CHECK-SSE1-NEXT: orps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_mone_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_mone_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT: vandnps (%rdi), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_mone_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm2
+; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2
+; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0
+; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2
+; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_42:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_42:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps (%rdi), %xmm1
+; CHECK-SSE2-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_42:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT: vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, %x
+  %my = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_42:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_42:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_42:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42]
+; CHECK-XOP-NEXT: vxorps (%rdi), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_varx_42_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1
+; CHECK-SSE1-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_varx_42_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps (%rdi), %xmm1
+; CHECK-SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_varx_42_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, %x
+  %my = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT: movaps (%rsi), %xmm2
+; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2
+; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0
+; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-SSE2-NEXT: movaps (%rdi), %xmm2
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT: vmovaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vxorps (%rdi), %xmm1, %xmm2
+; CHECK-XOP-NEXT: vandnps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_mone_vary:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT: orps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_mone_vary:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_mone_vary:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT: vandnps (%rsi), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_mone_vary:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_mone_vary:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_mone_vary:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT: vandnps (%rdx), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pand (%rsi), %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_mone_vary_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT: vpand (%rsi), %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps {{.*}}(%rip), %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm2
+; CHECK-SSE1-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2
+; CHECK-SSE1-NEXT: movaps %xmm2, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movdqa (%rsi), %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; CHECK-SSE2-NEXT: pxor (%rdx), %xmm2
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT: vpandn %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @out_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_42_vary:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_42_vary:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-SSE2-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_42_vary:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %notmask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_42_vary:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_42_vary:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42]
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_42_vary:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %mask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: out_constant_42_vary_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps {{.*}}(%rip), %xmm1
+; CHECK-SSE1-NEXT: andps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_constant_42_vary_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps {{.*}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: andps (%rsi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_constant_42_vary_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT: vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
+  %my = and <4 x i32> %mask, %y
+  %r = or <4 x i32> %mx, %my
+  ret <4 x i32> %r
+}
+
+; This is not a canonical form. Testing for completeness only.
+define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
+; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm2 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2
+; CHECK-SSE1-NEXT: andnps %xmm2, %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm2 = [42,42,42,42]
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_constant_42_vary_invmask:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm1
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-XOP-NEXT: vandnps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+  %x = load <4 x i32>, <4 x i32> *%px, align 16
+  %y = load <4 x i32>, <4 x i32> *%py, align 16
+  %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
+  %n1 = and <4 x i32> %n0, %notmask
+  %r = xor <4 x i32> %n1, %y
+  ret <4 x i32> %r
+}
Index: test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -0,0 +1,2309 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASELINE
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE1
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE,CHECK-SSE2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
+
+; https://bugs.llvm.org/show_bug.cgi?id=37104
+
+; All the advanced stuff (negative tests, commutativity) is handled in the
+; scalar version of the test only.
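+;
+; As in the AArch64 version of this test: the out_* functions below compute
+; (%x & %mask) | (%y & ~%mask), and the in_* functions compute the equivalent
+; ((%x ^ %y) & %mask) ^ %y form.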
+
+; ============================================================================ ;
+; 8-bit vector width
+; ============================================================================ ;
+
+define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
+; CHECK-LABEL: out_v1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andl %edx, %edi
+; CHECK-NEXT: notb %dl
+; CHECK-NEXT: andb %sil, %dl
+; CHECK-NEXT: orb %dil, %dl
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+  %mx = and <1 x i8> %x, %mask
+  %notmask = xor <1 x i8> %mask, <i8 -1>
+  %my = and <1 x i8> %y, %notmask
+  %r = or <1 x i8> %mx, %my
+  ret <1 x i8> %r
+}
+
+; ============================================================================ ;
+; 16-bit vector width
+; ============================================================================ ;
+
+define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v2i8:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: andl %r8d, %edi
+; CHECK-BASELINE-NEXT: andl %r9d, %esi
+; CHECK-BASELINE-NEXT: notb %r8b
+; CHECK-BASELINE-NEXT: notb %r9b
+; CHECK-BASELINE-NEXT: andb %cl, %r9b
+; CHECK-BASELINE-NEXT: andb %dl, %r8b
+; CHECK-BASELINE-NEXT: orb %dil, %r8b
+; CHECK-BASELINE-NEXT: orb %sil, %r9b
+; CHECK-BASELINE-NEXT: movl %r8d, %eax
+; CHECK-BASELINE-NEXT: movl %r9d, %edx
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: out_v2i8:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: andl %r8d, %edi
+; CHECK-SSE1-NEXT: andl %r9d, %esi
+; CHECK-SSE1-NEXT: notb %r8b
+; CHECK-SSE1-NEXT: notb %r9b
+; CHECK-SSE1-NEXT: andb %cl, %r9b
+; CHECK-SSE1-NEXT: andb %dl, %r8b
+; CHECK-SSE1-NEXT: orb %dil, %r8b
+; CHECK-SSE1-NEXT: orb %sil, %r9b
+; CHECK-SSE1-NEXT: movl %r8d, %eax
+; CHECK-SSE1-NEXT: movl %r9d, %edx
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_v2i8:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_v2i8:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %mx = and <2 x i8> %x, %mask
+  %notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
+  %my = and <2 x i8> %y, %notmask
+  %r = or <2 x i8> %mx, %my
+  ret <2 x i8> %r
+}
+
+define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
+; CHECK-LABEL: out_v1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andl %edx, %edi
+; CHECK-NEXT: notl %edx
+; CHECK-NEXT: andl %esi, %edx
+; CHECK-NEXT: orl %edi, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+  %mx = and <1 x i16> %x, %mask
+  %notmask = xor <1 x i16> %mask, <i16 -1>
+  %my = and <1 x i16> %y, %notmask
+  %r = or <1 x i16> %mx, %my
+  ret <1 x i16> %r
+}
+
+; ============================================================================ ;
+; 32-bit vector width
+; ============================================================================ ;
+
+define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i8:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: pushq %rbx
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: andb %bl, %r8b
+; CHECK-BASELINE-NEXT: andb %al, %cl
+; CHECK-BASELINE-NEXT: andb %r11b, %dl
+; CHECK-BASELINE-NEXT: andb %r10b, %sil
+; CHECK-BASELINE-NEXT: notb %r11b
+; CHECK-BASELINE-NEXT: notb %al
+; CHECK-BASELINE-NEXT: notb %bl
+; CHECK-BASELINE-NEXT: notb %r10b
+; CHECK-BASELINE-NEXT: andb %r9b, %r10b
+; CHECK-BASELINE-NEXT: orb %sil, %r10b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: orb %r8b, %bl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: orb %cl, %al
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: orb %dl, %r11b
+; CHECK-BASELINE-NEXT: movb %bl, 3(%rdi)
+; CHECK-BASELINE-NEXT: movb %al, 2(%rdi)
+; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT: movb %r10b, (%rdi)
+; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: popq %rbx
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: out_v4i8:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: pushq %rbx
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: andb %bl, %r8b
+; CHECK-SSE1-NEXT: andb %al, %cl
+; CHECK-SSE1-NEXT: andb %r11b, %dl
+; CHECK-SSE1-NEXT: andb %r10b, %sil
+; CHECK-SSE1-NEXT: notb %r11b
+; CHECK-SSE1-NEXT: notb %al
+; CHECK-SSE1-NEXT: notb %bl
+; CHECK-SSE1-NEXT: notb %r10b
+; CHECK-SSE1-NEXT: andb %r9b, %r10b
+; CHECK-SSE1-NEXT: orb %sil, %r10b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: orb %r8b, %bl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: orb %cl, %al
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: orb %dl, %r11b
+; CHECK-SSE1-NEXT: movb %bl, 3(%rdi)
+; CHECK-SSE1-NEXT: movb %al, 2(%rdi)
+; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT: movb %r10b, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: popq %rbx
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_v4i8:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_v4i8:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v4i8_undef:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: andb %al, %r8b
+; CHECK-BASELINE-NEXT: andb %r11b, %dl
+; CHECK-BASELINE-NEXT: andb %r10b, %sil
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: notb %r11b
+; CHECK-BASELINE-NEXT: notb %al
+; CHECK-BASELINE-NEXT: notb %r10b
+; CHECK-BASELINE-NEXT: andb %r9b, %r10b
+; CHECK-BASELINE-NEXT: orb %sil, %r10b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: orb %r8b, %al
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: orb %dl, %r11b
+; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi)
+; CHECK-BASELINE-NEXT: movb %al, 3(%rdi)
+; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT: movb %r10b, (%rdi)
+; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: out_v4i8_undef:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: andb %al, %r8b
+; CHECK-SSE1-NEXT: andb %r11b, %dl
+; CHECK-SSE1-NEXT: andb %r10b, %sil
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: notb %r11b
+; CHECK-SSE1-NEXT: notb %al
+; CHECK-SSE1-NEXT: notb %r10b
+; CHECK-SSE1-NEXT: andb %r9b, %r10b
+; CHECK-SSE1-NEXT: orb %sil, %r10b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: orb %r8b, %al
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: orb %dl, %r11b
+; CHECK-SSE1-NEXT: movb %cl, 2(%rdi)
+; CHECK-SSE1-NEXT: movb %al, 3(%rdi)
+; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT: movb %r10b, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_v4i8_undef:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_v4i8_undef:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %mx = and <4 x i8> %x, %mask
+  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
+  %my = and <4 x i8> %y, %notmask
+  %r = or <4 x i8> %mx, %my
+  ret <4 x i8> %r
+}
+
+define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: out_v2i16:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: andl %r9d, %esi
+; CHECK-BASELINE-NEXT: andl %r8d, %edi
+; CHECK-BASELINE-NEXT: notl %r8d
+; CHECK-BASELINE-NEXT: notl %r9d
+; CHECK-BASELINE-NEXT: andl %ecx, %r9d
+; CHECK-BASELINE-NEXT: orl %esi, %r9d
+; CHECK-BASELINE-NEXT: andl %edx, %r8d
+; CHECK-BASELINE-NEXT: orl %edi, %r8d
+; CHECK-BASELINE-NEXT: movl %r8d, %eax
+; CHECK-BASELINE-NEXT: movl %r9d, %edx
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: out_v2i16:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: andl %r9d, %esi
+; CHECK-SSE1-NEXT: andl %r8d, %edi
+; CHECK-SSE1-NEXT: notl %r8d
+; CHECK-SSE1-NEXT: notl %r9d
+; CHECK-SSE1-NEXT: andl %ecx, %r9d
+; CHECK-SSE1-NEXT: orl %esi, %r9d
+; CHECK-SSE1-NEXT: andl %edx, %r8d
+; CHECK-SSE1-NEXT: orl %edi, %r8d
+; CHECK-SSE1-NEXT: movl %r8d, %eax
+; CHECK-SSE1-NEXT: movl %r9d, %edx
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: out_v2i16:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2
+; CHECK-SSE2-NEXT: andps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: out_v2i16:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+  %mx = and <2 x i16> %x, %mask
+  %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
+  %my = and <2 x i16> %y, %notmask
+  %r = or <2 x i16> %mx, %my
+  ret <2 x i16> %r
+}
+
+define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
out_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: notl %edx +; CHECK-NEXT: andl %esi, %edx +; CHECK-NEXT: orl %edi, %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: retq + %mx = and <1 x i32> %x, %mask + %notmask = xor <1 x i32> %mask, + %my = and <1 x i32> %y, %notmask + %r = or <1 x i32> %mx, %my + ret <1 x i32> %r +} + +; ============================================================================ ; +; 64-bit vector width +; ============================================================================ ; + +define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v8i8: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: pushq %rbp +; CHECK-BASELINE-NEXT: pushq %r15 +; CHECK-BASELINE-NEXT: pushq %r14 +; CHECK-BASELINE-NEXT: pushq %r13 +; CHECK-BASELINE-NEXT: pushq %r12 +; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: andb %al, %r9b +; CHECK-BASELINE-NEXT: andb %bl, %r8b +; CHECK-BASELINE-NEXT: andb %r14b, %cl +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: andb %r11b, %dl +; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: andb %r10b, %sil +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: andb %r12b, %r13b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: andb %r15b, %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: andb %bpl, %dl +; CHECK-BASELINE-NEXT: notb %r10b +; CHECK-BASELINE-NEXT: notb %r11b +; CHECK-BASELINE-NEXT: notb %r14b +; CHECK-BASELINE-NEXT: notb %bl +; CHECK-BASELINE-NEXT: notb %al +; CHECK-BASELINE-NEXT: notb %bpl +; CHECK-BASELINE-NEXT: notb %r15b +; CHECK-BASELINE-NEXT: notb %r12b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: orb %r13b, %r12b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: orb %cl, %r15b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: orb %dl, %bpl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: orb %r9b, %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: orb %r8b, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: orb %sil, %r10b +; CHECK-BASELINE-NEXT: movb %r12b, 7(%rdi) +; CHECK-BASELINE-NEXT: movb %r15b, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %bpl, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %al, 4(%rdi) +; CHECK-BASELINE-NEXT: movb %bl, 3(%rdi) +; CHECK-BASELINE-NEXT: movb %r14b, 2(%rdi) +; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi) +; CHECK-BASELINE-NEXT: movb %r10b, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: popq %r12 +; CHECK-BASELINE-NEXT: popq %r13 +; CHECK-BASELINE-NEXT: popq %r14 
+; CHECK-BASELINE-NEXT: popq %r15 +; CHECK-BASELINE-NEXT: popq %rbp +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v8i8: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: pushq %rbp +; CHECK-SSE1-NEXT: pushq %r15 +; CHECK-SSE1-NEXT: pushq %r14 +; CHECK-SSE1-NEXT: pushq %r13 +; CHECK-SSE1-NEXT: pushq %r12 +; CHECK-SSE1-NEXT: pushq %rbx +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: andb %al, %r9b +; CHECK-SSE1-NEXT: andb %bl, %r8b +; CHECK-SSE1-NEXT: andb %r14b, %cl +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: andb %r11b, %dl +; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: andb %r10b, %sil +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: andb %r12b, %r13b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: andb %r15b, %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: andb %bpl, %dl +; CHECK-SSE1-NEXT: notb %r10b +; CHECK-SSE1-NEXT: notb %r11b +; CHECK-SSE1-NEXT: notb %r14b +; CHECK-SSE1-NEXT: notb %bl +; CHECK-SSE1-NEXT: notb %al +; CHECK-SSE1-NEXT: notb %bpl +; CHECK-SSE1-NEXT: notb %r15b +; CHECK-SSE1-NEXT: notb %r12b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: orb %r13b, %r12b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: orb %cl, %r15b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: orb %dl, %bpl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: orb %r9b, %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: orb %r8b, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: orb %sil, %r10b +; CHECK-SSE1-NEXT: movb %r12b, 7(%rdi) +; CHECK-SSE1-NEXT: movb %r15b, 6(%rdi) +; CHECK-SSE1-NEXT: movb %bpl, 5(%rdi) +; CHECK-SSE1-NEXT: movb %al, 4(%rdi) +; CHECK-SSE1-NEXT: movb %bl, 3(%rdi) +; CHECK-SSE1-NEXT: movb %r14b, 2(%rdi) +; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi) +; CHECK-SSE1-NEXT: movb %r10b, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: popq %r12 +; CHECK-SSE1-NEXT: popq %r13 +; CHECK-SSE1-NEXT: popq %r14 +; CHECK-SSE1-NEXT: popq %r15 +; CHECK-SSE1-NEXT: popq %rbp +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v8i8: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v8i8: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 +; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <8 x i8> %x, %mask + %notmask = xor <8 x i8> %mask, + %my = and <8 x i8> %y, %notmask + %r = or <8 x i8> %mx, %my + ret <8 x i8> %r +} + +define <4 x i16> @out_v4i16(<4 x 
i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v4i16: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andl %ebx, %esi +; CHECK-BASELINE-NEXT: andl %eax, %r8d +; CHECK-BASELINE-NEXT: andl %r11d, %ecx +; CHECK-BASELINE-NEXT: andl %r10d, %edx +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: notl %r11d +; CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: notl %ebx +; CHECK-BASELINE-NEXT: andl %r9d, %ebx +; CHECK-BASELINE-NEXT: orl %esi, %ebx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %ax +; CHECK-BASELINE-NEXT: orl %r8d, %eax +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: orl %ecx, %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %edx, %r10d +; CHECK-BASELINE-NEXT: movw %bx, (%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %r11w, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %r10w, 2(%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v4i16: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: pushq %rbx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andl %ebx, %esi +; CHECK-SSE1-NEXT: andl %eax, %r8d +; CHECK-SSE1-NEXT: andl %r11d, %ecx +; CHECK-SSE1-NEXT: andl %r10d, %edx +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: notl %r11d +; CHECK-SSE1-NEXT: notl %eax +; CHECK-SSE1-NEXT: notl %ebx +; CHECK-SSE1-NEXT: andl %r9d, %ebx +; CHECK-SSE1-NEXT: orl %esi, %ebx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %ax +; CHECK-SSE1-NEXT: orl %r8d, %eax +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: orl %ecx, %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %edx, %r10d +; CHECK-SSE1-NEXT: movw %bx, (%rdi) +; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) +; CHECK-SSE1-NEXT: movw %r11w, 4(%rdi) +; CHECK-SSE1-NEXT: movw %r10w, 2(%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v4i16: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v4i16: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 +; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <4 x i16> %x, %mask + %notmask = xor <4 x i16> %mask, + %my = and <4 x i16> %y, %notmask + %r = or <4 x i16> %mx, %my + ret <4 x i16> %r +} + +define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v4i16_undef: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: andl %eax, %esi +; CHECK-BASELINE-NEXT: andl %r11d, %r8d +; CHECK-BASELINE-NEXT: andl %r10d, %edx +; CHECK-BASELINE-NEXT: andw 
{{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: notl %r11d +; CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: andl %r9d, %eax +; CHECK-BASELINE-NEXT: orl %esi, %eax +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: orl %r8d, %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %edx, %r10d +; CHECK-BASELINE-NEXT: movw %cx, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, (%rdi) +; CHECK-BASELINE-NEXT: movw %r11w, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %r10w, 2(%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v4i16_undef: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: andl %eax, %esi +; CHECK-SSE1-NEXT: andl %r11d, %r8d +; CHECK-SSE1-NEXT: andl %r10d, %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: notl %r11d +; CHECK-SSE1-NEXT: notl %eax +; CHECK-SSE1-NEXT: andl %r9d, %eax +; CHECK-SSE1-NEXT: orl %esi, %eax +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: orl %r8d, %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %edx, %r10d +; CHECK-SSE1-NEXT: movw %cx, 4(%rdi) +; CHECK-SSE1-NEXT: movw %ax, (%rdi) +; CHECK-SSE1-NEXT: movw %r11w, 6(%rdi) +; CHECK-SSE1-NEXT: movw %r10w, 2(%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v4i16_undef: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v4i16_undef: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 +; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <4 x i16> %x, %mask + %notmask = xor <4 x i16> %mask, + %my = and <4 x i16> %y, %notmask + %r = or <4 x i16> %mx, %my + ret <4 x i16> %r +} + +define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v2i32: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: andl %r9d, %esi +; CHECK-BASELINE-NEXT: andl %r8d, %edi +; CHECK-BASELINE-NEXT: notl %r8d +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: andl %ecx, %r9d +; CHECK-BASELINE-NEXT: orl %esi, %r9d +; CHECK-BASELINE-NEXT: andl %edx, %r8d +; CHECK-BASELINE-NEXT: orl %edi, %r8d +; CHECK-BASELINE-NEXT: movl %r8d, %eax +; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v2i32: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: andl %r9d, %esi +; CHECK-SSE1-NEXT: andl %r8d, %edi +; CHECK-SSE1-NEXT: notl %r8d +; CHECK-SSE1-NEXT: notl %r9d +; CHECK-SSE1-NEXT: andl %ecx, %r9d +; CHECK-SSE1-NEXT: orl %esi, %r9d +; CHECK-SSE1-NEXT: andl %edx, %r8d +; CHECK-SSE1-NEXT: orl %edi, %r8d +; CHECK-SSE1-NEXT: movl %r8d, %eax +; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v2i32: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps {{.*}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: andps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v2i32: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vandps 
%xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm1, %xmm1 +; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <2 x i32> %x, %mask + %notmask = xor <2 x i32> %mask, + %my = and <2 x i32> %y, %notmask + %r = or <2 x i32> %mx, %my + ret <2 x i32> %r +} + +define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { +; CHECK-LABEL: out_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: andq %rdx, %rdi +; CHECK-NEXT: notq %rdx +; CHECK-NEXT: andq %rsi, %rdx +; CHECK-NEXT: orq %rdi, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: retq + %mx = and <1 x i64> %x, %mask + %notmask = xor <1 x i64> %mask, + %my = and <1 x i64> %y, %notmask + %r = or <1 x i64> %mx, %my + ret <1 x i64> %r +} + +; ============================================================================ ; +; 128-bit vector width +; ============================================================================ ; + +define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v16i8: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: pushq %rbp +; CHECK-BASELINE-NEXT: pushq %r15 +; CHECK-BASELINE-NEXT: pushq %r14 +; CHECK-BASELINE-NEXT: pushq %r13 +; CHECK-BASELINE-NEXT: pushq %r12 +; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %al, %sil +; CHECK-BASELINE-NEXT: notb %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: orb %sil, %al +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %cl, %sil +; CHECK-BASELINE-NEXT: notb %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: orb %sil, %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %dl, %sil +; CHECK-BASELINE-NEXT: notb %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: orb %sil, %dl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %bl, %sil +; CHECK-BASELINE-NEXT: notb %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: orb %sil, %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %r13b, %sil +; CHECK-BASELINE-NEXT: notb %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: orb %sil, %r13b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %r12b, %sil +; CHECK-BASELINE-NEXT: notb %r12b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: orb %sil, %r12b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %r15b, %sil +; CHECK-BASELINE-NEXT: 
notb %r15b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: orb %sil, %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %r14b, %sil +; CHECK-BASELINE-NEXT: notb %r14b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: orb %sil, %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %bpl, %sil +; CHECK-BASELINE-NEXT: notb %bpl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: orb %sil, %bpl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %r11b, %sil +; CHECK-BASELINE-NEXT: notb %r11b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: orb %sil, %r11b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb %r10b, %sil +; CHECK-BASELINE-NEXT: notb %r10b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: orb %sil, %r10b +; CHECK-BASELINE-NEXT: movb %al, 15(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 14(%rdi) +; CHECK-BASELINE-NEXT: movb %dl, 13(%rdi) +; CHECK-BASELINE-NEXT: movb %bl, 12(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 11(%rdi) +; CHECK-BASELINE-NEXT: movb %r12b, 10(%rdi) +; CHECK-BASELINE-NEXT: movb %r15b, 9(%rdi) +; CHECK-BASELINE-NEXT: movb %r14b, 8(%rdi) +; CHECK-BASELINE-NEXT: movb %bpl, 7(%rdi) +; CHECK-BASELINE-NEXT: movb %r11b, 6(%rdi) +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: andb %al, %r9b +; CHECK-BASELINE-NEXT: notb %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: orb %r9b, %al +; CHECK-BASELINE-NEXT: movb %r10b, 5(%rdi) +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: andb %cl, %r8b +; CHECK-BASELINE-NEXT: notb %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: orb %r8b, %cl +; CHECK-BASELINE-NEXT: movb %al, 4(%rdi) +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: andb %al, %dl +; CHECK-BASELINE-NEXT: notb %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: orb %dl, %al +; CHECK-BASELINE-NEXT: movb %cl, 3(%rdi) +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: andb %cl, %dl +; CHECK-BASELINE-NEXT: notb %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: orb %dl, %cl +; CHECK-BASELINE-NEXT: movb %al, 2(%rdi) +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: andb %al, %dl +; CHECK-BASELINE-NEXT: notb %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: orb %dl, %al +; CHECK-BASELINE-NEXT: movb %cl, 1(%rdi) +; CHECK-BASELINE-NEXT: movb %al, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: popq %r12 +; CHECK-BASELINE-NEXT: popq %r13 +; CHECK-BASELINE-NEXT: popq %r14 +; CHECK-BASELINE-NEXT: popq %r15 +; CHECK-BASELINE-NEXT: popq %rbp +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v16i8: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: pushq %rbp +; CHECK-SSE1-NEXT: pushq %r15 +; CHECK-SSE1-NEXT: pushq %r14 +; CHECK-SSE1-NEXT: pushq %r13 +; CHECK-SSE1-NEXT: pushq %r12 +; CHECK-SSE1-NEXT: pushq %rbx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %al, %sil +; CHECK-SSE1-NEXT: notb %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: orb %sil, %al +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %cl, %sil +; CHECK-SSE1-NEXT: notb %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: orb %sil, %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %dl, %sil +; CHECK-SSE1-NEXT: notb %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: orb %sil, %dl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %bl, %sil +; CHECK-SSE1-NEXT: notb %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: orb %sil, %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %r13b, %sil +; CHECK-SSE1-NEXT: notb %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: orb %sil, %r13b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %r12b, %sil +; CHECK-SSE1-NEXT: notb %r12b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: orb %sil, %r12b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %r15b, %sil +; CHECK-SSE1-NEXT: notb %r15b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: orb %sil, %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %r14b, %sil +; CHECK-SSE1-NEXT: notb %r14b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: orb %sil, %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %bpl, %sil +; CHECK-SSE1-NEXT: notb %bpl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: orb %sil, %bpl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %r11b, %sil +; CHECK-SSE1-NEXT: notb %r11b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: orb %sil, %r11b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb %r10b, %sil +; CHECK-SSE1-NEXT: notb %r10b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: orb %sil, %r10b +; CHECK-SSE1-NEXT: movb %al, 15(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 14(%rdi) +; CHECK-SSE1-NEXT: movb %dl, 13(%rdi) +; CHECK-SSE1-NEXT: movb %bl, 12(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 11(%rdi) +; CHECK-SSE1-NEXT: movb %r12b, 10(%rdi) +; CHECK-SSE1-NEXT: movb %r15b, 9(%rdi) +; CHECK-SSE1-NEXT: movb %r14b, 8(%rdi) +; CHECK-SSE1-NEXT: movb %bpl, 7(%rdi) +; CHECK-SSE1-NEXT: movb %r11b, 6(%rdi) +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: andb %al, %r9b +; CHECK-SSE1-NEXT: notb %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: orb %r9b, %al +; CHECK-SSE1-NEXT: movb %r10b, 5(%rdi) +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: andb %cl, %r8b 
+; CHECK-SSE1-NEXT: notb %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: orb %r8b, %cl +; CHECK-SSE1-NEXT: movb %al, 4(%rdi) +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: andb %al, %dl +; CHECK-SSE1-NEXT: notb %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: orb %dl, %al +; CHECK-SSE1-NEXT: movb %cl, 3(%rdi) +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: andb %cl, %dl +; CHECK-SSE1-NEXT: notb %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: orb %dl, %cl +; CHECK-SSE1-NEXT: movb %al, 2(%rdi) +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: andb %al, %dl +; CHECK-SSE1-NEXT: notb %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: orb %dl, %al +; CHECK-SSE1-NEXT: movb %cl, 1(%rdi) +; CHECK-SSE1-NEXT: movb %al, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: popq %r12 +; CHECK-SSE1-NEXT: popq %r13 +; CHECK-SSE1-NEXT: popq %r14 +; CHECK-SSE1-NEXT: popq %r15 +; CHECK-SSE1-NEXT: popq %rbp +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v16i8: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v16i8: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <16 x i8> %x, %mask + %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %my = and <16 x i8> %y, %notmask + %r = or <16 x i8> %mx, %my + ret <16 x i8> %r +} + +define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v8i16: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: pushq %rbp +; CHECK-BASELINE-NEXT: pushq %r14 +; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %r14w, %bx +; CHECK-BASELINE-NEXT: notl %r14d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r14w +; CHECK-BASELINE-NEXT: orl %ebx, %r14d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %r11w, %bx +; CHECK-BASELINE-NEXT: notl %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: orl %ebx, %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %r10w, %bx +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %ebx, %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andl %ebx, %r9d +; CHECK-BASELINE-NEXT: notl %ebx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-BASELINE-NEXT: orl %r9d, %ebx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: andl %eax, %r8d +; CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %ax +; CHECK-BASELINE-NEXT: orl %r8d, %eax +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; CHECK-BASELINE-NEXT: andl %ebp, %ecx +; CHECK-BASELINE-NEXT: notl %ebp +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp +; CHECK-BASELINE-NEXT: orl %ecx, %ebp +; 
CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-BASELINE-NEXT: andl %ecx, %edx +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: orl %edx, %ecx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-BASELINE-NEXT: andl %edx, %esi +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: orl %esi, %edx +; CHECK-BASELINE-NEXT: movw %r14w, 14(%rdi) +; CHECK-BASELINE-NEXT: movw %r11w, 12(%rdi) +; CHECK-BASELINE-NEXT: movw %r10w, 10(%rdi) +; CHECK-BASELINE-NEXT: movw %bx, 8(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %bp, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %cx, 2(%rdi) +; CHECK-BASELINE-NEXT: movw %dx, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: popq %r14 +; CHECK-BASELINE-NEXT: popq %rbp +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v8i16: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: pushq %rbp +; CHECK-SSE1-NEXT: pushq %r14 +; CHECK-SSE1-NEXT: pushq %rbx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %r14w, %bx +; CHECK-SSE1-NEXT: notl %r14d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r14w +; CHECK-SSE1-NEXT: orl %ebx, %r14d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %r11w, %bx +; CHECK-SSE1-NEXT: notl %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: orl %ebx, %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %r10w, %bx +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %ebx, %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andl %ebx, %r9d +; CHECK-SSE1-NEXT: notl %ebx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-SSE1-NEXT: orl %r9d, %ebx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: andl %eax, %r8d +; CHECK-SSE1-NEXT: notl %eax +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %ax +; CHECK-SSE1-NEXT: orl %r8d, %eax +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; CHECK-SSE1-NEXT: andl %ebp, %ecx +; CHECK-SSE1-NEXT: notl %ebp +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp +; CHECK-SSE1-NEXT: orl %ecx, %ebp +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-SSE1-NEXT: andl %ecx, %edx +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: orl %edx, %ecx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-SSE1-NEXT: andl %edx, %esi +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: orl %esi, %edx +; CHECK-SSE1-NEXT: movw %r14w, 14(%rdi) +; CHECK-SSE1-NEXT: movw %r11w, 12(%rdi) +; CHECK-SSE1-NEXT: movw %r10w, 10(%rdi) +; CHECK-SSE1-NEXT: movw %bx, 8(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) +; CHECK-SSE1-NEXT: movw %bp, 4(%rdi) +; CHECK-SSE1-NEXT: movw %cx, 2(%rdi) +; CHECK-SSE1-NEXT: movw %dx, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: popq %r14 +; CHECK-SSE1-NEXT: popq %rbp +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v8i16: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v8i16: +; CHECK-XOP: # %bb.0: +; 
CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <8 x i16> %x, %mask + %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %my = and <8 x i16> %y, %notmask + %r = or <8 x i16> %mx, %my + ret <8 x i16> %r +} + +define <4 x i32> @out_v4i32(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind { +; CHECK-BASELINE-LABEL: out_v4i32: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movl (%rcx), %r8d +; CHECK-BASELINE-NEXT: movl 4(%rcx), %r9d +; CHECK-BASELINE-NEXT: movl 8(%rcx), %eax +; CHECK-BASELINE-NEXT: movl 12(%rcx), %ecx +; CHECK-BASELINE-NEXT: movl 12(%rsi), %r10d +; CHECK-BASELINE-NEXT: andl %ecx, %r10d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d +; CHECK-BASELINE-NEXT: andl %eax, %r11d +; CHECK-BASELINE-NEXT: movl 4(%rsi), %ebx +; CHECK-BASELINE-NEXT: andl %r9d, %ebx +; CHECK-BASELINE-NEXT: movl (%rsi), %esi +; CHECK-BASELINE-NEXT: andl %r8d, %esi +; CHECK-BASELINE-NEXT: notl %r8d +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andl 12(%rdx), %ecx +; CHECK-BASELINE-NEXT: orl %r10d, %ecx +; CHECK-BASELINE-NEXT: andl 8(%rdx), %eax +; CHECK-BASELINE-NEXT: orl %r11d, %eax +; CHECK-BASELINE-NEXT: andl 4(%rdx), %r9d +; CHECK-BASELINE-NEXT: orl %ebx, %r9d +; CHECK-BASELINE-NEXT: andl (%rdx), %r8d +; CHECK-BASELINE-NEXT: orl %esi, %r8d +; CHECK-BASELINE-NEXT: movl %ecx, 12(%rdi) +; CHECK-BASELINE-NEXT: movl %eax, 8(%rdi) +; CHECK-BASELINE-NEXT: movl %r9d, 4(%rdi) +; CHECK-BASELINE-NEXT: movl %r8d, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v4i32: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 +; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v4i32: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps (%rdi), %xmm1 +; CHECK-SSE2-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v4i32: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1 +; CHECK-XOP-NEXT: vpcmov %xmm1, (%rsi), %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %x = load <4 x i32>, <4 x i32> *%px, align 16 + %y = load <4 x i32>, <4 x i32> *%py, align 16 + %mask = load <4 x i32>, <4 x i32> *%pmask, align 16 + %mx = and <4 x i32> %x, %mask + %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1> + %my = and <4 x i32> %y, %notmask + %r = or <4 x i32> %mx, %my + ret <4 x i32> %r +} + +define <4 x i32> @out_v4i32_undef(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind { +; CHECK-BASELINE-LABEL: out_v4i32_undef: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r8d +; CHECK-BASELINE-NEXT: movl (%rcx), %r9d +; CHECK-BASELINE-NEXT: movl 4(%rcx), %r10d +; CHECK-BASELINE-NEXT: movl 12(%rcx), %eax +; CHECK-BASELINE-NEXT: andl 8(%rcx), %r8d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %ecx +; CHECK-BASELINE-NEXT: andl %eax, %ecx +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r11d +; CHECK-BASELINE-NEXT: andl %r10d, %r11d +; CHECK-BASELINE-NEXT: movl (%rsi), %esi +; CHECK-BASELINE-NEXT: andl %r9d, %esi +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: notl %r10d +;
CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: andl 12(%rdx), %eax +; CHECK-BASELINE-NEXT: orl %ecx, %eax +; CHECK-BASELINE-NEXT: andl 4(%rdx), %r10d +; CHECK-BASELINE-NEXT: orl %r11d, %r10d +; CHECK-BASELINE-NEXT: andl (%rdx), %r9d +; CHECK-BASELINE-NEXT: orl %esi, %r9d +; CHECK-BASELINE-NEXT: movl %r8d, 8(%rdi) +; CHECK-BASELINE-NEXT: movl %eax, 12(%rdi) +; CHECK-BASELINE-NEXT: movl %r10d, 4(%rdi) +; CHECK-BASELINE-NEXT: movl %r9d, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v4i32_undef: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 +; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v4i32_undef: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movaps (%rdi), %xmm1 +; CHECK-SSE2-NEXT: andps %xmm0, %xmm1 +; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v4i32_undef: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1 +; CHECK-XOP-NEXT: vpcmov %xmm1, (%rsi), %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %x = load <4 x i32>, <4 x i32> *%px, align 16 + %y = load <4 x i32>, <4 x i32> *%py, align 16 + %mask = load <4 x i32>, <4 x i32> *%pmask, align 16 + %mx = and <4 x i32> %x, %mask + %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1> + %my = and <4 x i32> %y, %notmask + %r = or <4 x i32> %mx, %my + ret <4 x i32> %r +} + +define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { +; CHECK-BASELINE-LABEL: out_v2i64: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: andq %r9, %rsi +; CHECK-BASELINE-NEXT: andq %r8, %rdi +; CHECK-BASELINE-NEXT: notq %r8 +; CHECK-BASELINE-NEXT: notq %r9 +; CHECK-BASELINE-NEXT: andq %rcx, %r9 +; CHECK-BASELINE-NEXT: orq %rsi, %r9 +; CHECK-BASELINE-NEXT: andq %rdx, %r8 +; CHECK-BASELINE-NEXT: orq %rdi, %r8 +; CHECK-BASELINE-NEXT: movq %r8, %rax +; CHECK-BASELINE-NEXT: movq %r9, %rdx +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: out_v2i64: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: andq %r9, %rsi +; CHECK-SSE1-NEXT: andq %r8, %rdi +; CHECK-SSE1-NEXT: notq %r8 +; CHECK-SSE1-NEXT: notq %r9 +; CHECK-SSE1-NEXT: andq %rcx, %r9 +; CHECK-SSE1-NEXT: orq %rsi, %r9 +; CHECK-SSE1-NEXT: andq %rdx, %r8 +; CHECK-SSE1-NEXT: orq %rdi, %r8 +; CHECK-SSE1-NEXT: movq %r8, %rax +; CHECK-SSE1-NEXT: movq %r9, %rdx +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: out_v2i64: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: out_v2i64: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %mx = and <2 x i64> %x, %mask + %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1> + %my = and <2 x i64> %y, %notmask + %r = or <2 x i64> %mx, %my + ret <2 x i64> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Should be the same as the previous one.
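+; That is, for each bit the "out" pattern above computes (%x & %mask) | (%y & ~%mask)
+; and the "in" pattern below computes ((%x ^ %y) & %mask) ^ %y: a set mask bit selects
+; the bit from %x and a clear one the bit from %y, so both are the same bitwise select.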
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; ============================================================================ ; +; 8-bit vector width +; ============================================================================ ; + +define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { +; CHECK-LABEL: in_v1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %n0 = xor <1 x i8> %x, %y + %n1 = and <1 x i8> %n0, %mask + %r = xor <1 x i8> %n1, %y + ret <1 x i8> %r +} + +; ============================================================================ ; +; 16-bit vector width +; ============================================================================ ; + +define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { +; CHECK-BASELINE-LABEL: in_v2i8: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %edi +; CHECK-BASELINE-NEXT: andl %r8d, %edi +; CHECK-BASELINE-NEXT: andl %r9d, %esi +; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %edi +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: movl %esi, %edx +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: in_v2i8: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: xorl %edx, %edi +; CHECK-SSE1-NEXT: andl %r8d, %edi +; CHECK-SSE1-NEXT: andl %r9d, %esi +; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: xorl %edx, %edi +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: movl %esi, %edx +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: in_v2i8: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: in_v2i8: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %n0 = xor <2 x i8> %x, %y + %n1 = and <2 x i8> %n0, %mask + %r = xor <2 x i8> %n1, %y + ret <2 x i8> %r +} + +define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { +; CHECK-LABEL: in_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %n0 = xor <1 x i16> %x, %y + %n1 = and <1 x i16> %n0, %mask + %r = xor <1 x i16> %n1, %y + ret <1 x i16> %r +} + +; ============================================================================ ; +; 32-bit vector width +; ============================================================================ ; + +define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { +; CHECK-BASELINE-LABEL: in_v4i8: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: xorb %r11b, %cl +; CHECK-BASELINE-NEXT: xorb %r10b, %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: xorb %r9b, %sil +; CHECK-BASELINE-NEXT: xorb %al, %dl +; 
CHECK-BASELINE-NEXT: xorb %r11b, %cl +; CHECK-BASELINE-NEXT: xorb %r10b, %r8b +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi) +; CHECK-BASELINE-NEXT: movb %dl, 1(%rdi) +; CHECK-BASELINE-NEXT: movb %sil, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: in_v4i8: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: xorb %r11b, %cl +; CHECK-SSE1-NEXT: xorb %r10b, %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: xorb %r9b, %sil +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: xorb %r11b, %cl +; CHECK-SSE1-NEXT: xorb %r10b, %r8b +; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 2(%rdi) +; CHECK-SSE1-NEXT: movb %dl, 1(%rdi) +; CHECK-SSE1-NEXT: movb %sil, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: in_v4i8: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: in_v4i8: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %n0 = xor <4 x i8> %x, %y + %n1 = and <4 x i8> %n0, %mask + %r = xor <4 x i8> %n1, %y + ret <4 x i8> %r +} + +define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { +; CHECK-BASELINE-LABEL: in_v2i16: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %edi +; CHECK-BASELINE-NEXT: andl %r8d, %edi +; CHECK-BASELINE-NEXT: andl %r9d, %esi +; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %edi +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: movl %esi, %edx +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: in_v2i16: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: xorl %edx, %edi +; CHECK-SSE1-NEXT: andl %r8d, %edi +; CHECK-SSE1-NEXT: andl %r9d, %esi +; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: xorl %edx, %edi +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: movl %esi, %edx +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: in_v2i16: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: in_v2i16: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %n0 = xor <2 x i16> %x, %y + %n1 = and <2 x i16> %n0, %mask + %r = xor <2 x i16> %n1, %y + ret <2 x i16> %r +} + +define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { +; CHECK-LABEL: in_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + %n0 = xor <1 x i32> %x, %y + %n1 = and <1 x i32> %n0, %mask + %r = xor <1 x i32> %n1, %y + ret <1 x i32> %r +} + +; 
============================================================================ ; +; 64-bit vector width +; ============================================================================ ; + +define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { +; CHECK-BASELINE-LABEL: in_v8i8: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: pushq %rbp +; CHECK-BASELINE-NEXT: pushq %r15 +; CHECK-BASELINE-NEXT: pushq %r14 +; CHECK-BASELINE-NEXT: pushq %r13 +; CHECK-BASELINE-NEXT: pushq %r12 +; CHECK-BASELINE-NEXT: pushq %rbx +; CHECK-BASELINE-NEXT: movl %ecx, %r10d +; CHECK-BASELINE-NEXT: movl %edx, %r11d +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: xorb %bpl, %sil +; CHECK-BASELINE-NEXT: xorb %r13b, %r11b +; CHECK-BASELINE-NEXT: xorb %r12b, %r10b +; CHECK-BASELINE-NEXT: xorb %r15b, %r8b +; CHECK-BASELINE-NEXT: xorb %r14b, %r9b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %bpl, %sil +; CHECK-BASELINE-NEXT: xorb %r13b, %r11b +; CHECK-BASELINE-NEXT: xorb %r12b, %r10b +; CHECK-BASELINE-NEXT: xorb %r15b, %r8b +; CHECK-BASELINE-NEXT: xorb %r14b, %r9b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: movb %al, 7(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %dl, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) +; CHECK-BASELINE-NEXT: movb %r10b, 2(%rdi) +; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi) +; CHECK-BASELINE-NEXT: movb %sil, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: popq %rbx +; CHECK-BASELINE-NEXT: popq %r12 +; CHECK-BASELINE-NEXT: popq %r13 +; CHECK-BASELINE-NEXT: popq %r14 +; CHECK-BASELINE-NEXT: popq %r15 +; CHECK-BASELINE-NEXT: popq %rbp +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: in_v8i8: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: pushq %rbp +; CHECK-SSE1-NEXT: pushq %r15 +; CHECK-SSE1-NEXT: pushq %r14 +; CHECK-SSE1-NEXT: pushq %r13 +; CHECK-SSE1-NEXT: pushq %r12 +; CHECK-SSE1-NEXT: pushq %rbx +; CHECK-SSE1-NEXT: movl %ecx, %r10d +; CHECK-SSE1-NEXT: movl %edx, %r11d +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: xorb %bpl, %sil +; CHECK-SSE1-NEXT: xorb %r13b, %r11b +; CHECK-SSE1-NEXT: xorb %r12b, %r10b +; CHECK-SSE1-NEXT: xorb 
%r15b, %r8b +; CHECK-SSE1-NEXT: xorb %r14b, %r9b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %bpl, %sil +; CHECK-SSE1-NEXT: xorb %r13b, %r11b +; CHECK-SSE1-NEXT: xorb %r12b, %r10b +; CHECK-SSE1-NEXT: xorb %r15b, %r8b +; CHECK-SSE1-NEXT: xorb %r14b, %r9b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: movb %al, 7(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 6(%rdi) +; CHECK-SSE1-NEXT: movb %dl, 5(%rdi) +; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) +; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) +; CHECK-SSE1-NEXT: movb %r10b, 2(%rdi) +; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi) +; CHECK-SSE1-NEXT: movb %sil, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: popq %rbx +; CHECK-SSE1-NEXT: popq %r12 +; CHECK-SSE1-NEXT: popq %r13 +; CHECK-SSE1-NEXT: popq %r14 +; CHECK-SSE1-NEXT: popq %r15 +; CHECK-SSE1-NEXT: popq %rbp +; CHECK-SSE1-NEXT: retq +; +; CHECK-SSE2-LABEL: in_v8i8: +; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: andps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-XOP-LABEL: in_v8i8: +; CHECK-XOP: # %bb.0: +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-XOP-NEXT: retq + %n0 = xor <8 x i8> %x, %y + %n1 = and <8 x i8> %n0, %mask + %r = xor <8 x i8> %n1, %y + ret <8 x i8> %r +} + +define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { +; CHECK-BASELINE-LABEL: in_v4i16: +; CHECK-BASELINE: # %bb.0: +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: xorl %eax, %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: xorl %eax, %edx +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: xorl %r10d, %r8d +; CHECK-BASELINE-NEXT: movw %r8w, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %cx, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %dx, 2(%rdi) +; CHECK-BASELINE-NEXT: movw %si, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: retq +; +; CHECK-SSE1-LABEL: in_v4i16: +; CHECK-SSE1: # %bb.0: +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorl %r10d, %r8d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorl %r11d, %ecx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: xorl %eax, %edx +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw 
{{[0-9]+}}(%rsp), %r8w
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT: xorl %r9d, %esi
+; CHECK-SSE1-NEXT: xorl %eax, %edx
+; CHECK-SSE1-NEXT: xorl %r11d, %ecx
+; CHECK-SSE1-NEXT: xorl %r10d, %r8d
+; CHECK-SSE1-NEXT: movw %r8w, 6(%rdi)
+; CHECK-SSE1-NEXT: movw %cx, 4(%rdi)
+; CHECK-SSE1-NEXT: movw %dx, 2(%rdi)
+; CHECK-SSE1-NEXT: movw %si, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_v4i16:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_v4i16:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+ %n0 = xor <4 x i16> %x, %y
+ %n1 = and <4 x i16> %n0, %mask
+ %r = xor <4 x i16> %n1, %y
+ ret <4 x i16> %r
+}
+
+define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v2i32:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: xorl %edx, %edi
+; CHECK-BASELINE-NEXT: xorl %ecx, %esi
+; CHECK-BASELINE-NEXT: andl %r9d, %esi
+; CHECK-BASELINE-NEXT: andl %r8d, %edi
+; CHECK-BASELINE-NEXT: xorl %edx, %edi
+; CHECK-BASELINE-NEXT: xorl %ecx, %esi
+; CHECK-BASELINE-NEXT: movl %edi, %eax
+; CHECK-BASELINE-NEXT: movl %esi, %edx
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: in_v2i32:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: xorl %edx, %edi
+; CHECK-SSE1-NEXT: xorl %ecx, %esi
+; CHECK-SSE1-NEXT: andl %r9d, %esi
+; CHECK-SSE1-NEXT: andl %r8d, %edi
+; CHECK-SSE1-NEXT: xorl %edx, %edi
+; CHECK-SSE1-NEXT: xorl %ecx, %esi
+; CHECK-SSE1-NEXT: movl %edi, %eax
+; CHECK-SSE1-NEXT: movl %esi, %edx
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_v2i32:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_v2i32:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+ %n0 = xor <2 x i32> %x, %y
+ %n1 = and <2 x i32> %n0, %mask
+ %r = xor <2 x i32> %n1, %y
+ ret <2 x i32> %r
+}
+
+define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
+; CHECK-LABEL: in_v1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorq %rsi, %rdi
+; CHECK-NEXT: andq %rdx, %rdi
+; CHECK-NEXT: xorq %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %n0 = xor <1 x i64> %x, %y
+ %n1 = and <1 x i64> %n0, %mask
+ %r = xor <1 x i64> %n1, %y
+ ret <1 x i64> %r
+}
+
+; ============================================================================ ;
+; 128-bit vector width
+; ============================================================================ ;
+
+define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v16i8:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: pushq %rbp
+; CHECK-BASELINE-NEXT: pushq %r15
+; CHECK-BASELINE-NEXT: pushq %r14
+; CHECK-BASELINE-NEXT: pushq %r13
+; CHECK-BASELINE-NEXT: pushq %r12
+; CHECK-BASELINE-NEXT: pushq %rbx
+; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: xorb %al, %r9b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-BASELINE-NEXT: xorb %al, %r9b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT: xorb %r10b, %dl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT: xorb %r10b, %dl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT: xorb %r11b, %r10b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT: xorb %r11b, %r10b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: xorb %bl, %r11b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT: xorb %bl, %r11b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: xorb %bpl, %bl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT: xorb %bpl, %bl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT: xorb %r13b, %bpl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT: xorb %r13b, %bpl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT: xorb %r12b, %r13b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-BASELINE-NEXT: xorb %r12b, %r13b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT: xorb %r15b, %r12b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT: xorb %r15b, %r12b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT: xorb %r14b, %r15b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT: xorb %r14b, %r15b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT: xorb %sil, %r14b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT: xorb %sil, %r14b
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: xorb %cl, %al
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: xorb %cl, %al
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT: xorb %sil, %cl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: xorb %sil, %cl
+; CHECK-BASELINE-NEXT: movb %cl, 15(%rdi)
+; CHECK-BASELINE-NEXT: movb %al, 14(%rdi)
+; CHECK-BASELINE-NEXT: movb %r14b, 13(%rdi)
+; CHECK-BASELINE-NEXT: movb %r15b, 12(%rdi)
+; CHECK-BASELINE-NEXT: movb %r12b, 11(%rdi)
+; CHECK-BASELINE-NEXT: movb %r13b, 10(%rdi)
+; CHECK-BASELINE-NEXT: movb %bpl, 9(%rdi)
+; CHECK-BASELINE-NEXT: movb %bl, 8(%rdi)
+; CHECK-BASELINE-NEXT: movb %r11b, 7(%rdi)
+; CHECK-BASELINE-NEXT: movb %r10b, 6(%rdi)
+; CHECK-BASELINE-NEXT: movb %dl, 5(%rdi)
+; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi)
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: xorb %al, %r8b
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-BASELINE-NEXT: xorb %al, %r8b
+; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi)
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT: xorb %al, %cl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: xorb %al, %cl
+; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi)
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT: xorb %al, %cl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: xorb %al, %cl
+; CHECK-BASELINE-NEXT: movb %cl, 1(%rdi)
+; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT: xorb %al, %cl
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT: xorb %al, %cl
+; CHECK-BASELINE-NEXT: movb %cl, (%rdi)
+; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: popq %rbx
+; CHECK-BASELINE-NEXT: popq %r12
+; CHECK-BASELINE-NEXT: popq %r13
+; CHECK-BASELINE-NEXT: popq %r14
+; CHECK-BASELINE-NEXT: popq %r15
+; CHECK-BASELINE-NEXT: popq %rbp
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: in_v16i8:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: pushq %rbp
+; CHECK-SSE1-NEXT: pushq %r15
+; CHECK-SSE1-NEXT: pushq %r14
+; CHECK-SSE1-NEXT: pushq %r13
+; CHECK-SSE1-NEXT: pushq %r12
+; CHECK-SSE1-NEXT: pushq %rbx
+; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: xorb %al, %r9b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b
+; CHECK-SSE1-NEXT: xorb %al, %r9b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT: xorb %r10b, %dl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT: xorb %r10b, %dl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT: xorb %r11b, %r10b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT: xorb %r11b, %r10b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: xorb %bl, %r11b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT: xorb %bl, %r11b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: xorb %bpl, %bl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT: xorb %bpl, %bl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT: xorb %r13b, %bpl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT: xorb %r13b, %bpl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT: xorb %r12b, %r13b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b
+; CHECK-SSE1-NEXT: xorb %r12b, %r13b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT: xorb %r15b, %r12b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT: xorb %r15b, %r12b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT: xorb %r14b, %r15b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT: xorb %r14b, %r15b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT: xorb %sil, %r14b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT: xorb %sil, %r14b
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: xorb %cl, %al
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: xorb %cl, %al
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT: xorb %sil, %cl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: xorb %sil, %cl
+; CHECK-SSE1-NEXT: movb %cl, 15(%rdi)
+; CHECK-SSE1-NEXT: movb %al, 14(%rdi)
+; CHECK-SSE1-NEXT: movb %r14b, 13(%rdi)
+; CHECK-SSE1-NEXT: movb %r15b, 12(%rdi)
+; CHECK-SSE1-NEXT: movb %r12b, 11(%rdi)
+; CHECK-SSE1-NEXT: movb %r13b, 10(%rdi)
+; CHECK-SSE1-NEXT: movb %bpl, 9(%rdi)
+; CHECK-SSE1-NEXT: movb %bl, 8(%rdi)
+; CHECK-SSE1-NEXT: movb %r11b, 7(%rdi)
+; CHECK-SSE1-NEXT: movb %r10b, 6(%rdi)
+; CHECK-SSE1-NEXT: movb %dl, 5(%rdi)
+; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi)
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: xorb %al, %r8b
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b
+; CHECK-SSE1-NEXT: xorb %al, %r8b
+; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi)
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT: xorb %al, %cl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: xorb %al, %cl
+; CHECK-SSE1-NEXT: movb %cl, 2(%rdi)
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT: xorb %al, %cl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: xorb %al, %cl
+; CHECK-SSE1-NEXT: movb %cl, 1(%rdi)
+; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT: xorb %al, %cl
+; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT: xorb %al, %cl
+; CHECK-SSE1-NEXT: movb %cl, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: popq %rbx
+; CHECK-SSE1-NEXT: popq %r12
+; CHECK-SSE1-NEXT: popq %r13
+; CHECK-SSE1-NEXT: popq %r14
+; CHECK-SSE1-NEXT: popq %r15
+; CHECK-SSE1-NEXT: popq %rbp
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_v16i8:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_v16i8:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+ %n0 = xor <16 x i8> %x, %y
+ %n1 = and <16 x i8> %n0, %mask
+ %r = xor <16 x i8> %n1, %y
+ ret <16 x i8> %r
+}
+
+define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v8i16:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: pushq %rbp
+; CHECK-BASELINE-NEXT: pushq %r14
+; CHECK-BASELINE-NEXT: pushq %rbx
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT: xorl %r10d, %r9d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT: xorl %r11d, %r8d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT: xorl %eax, %ecx
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: xorl %ebx, %esi
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT: xorl %ebx, %esi
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: xorl %ebx, %edx
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx
+; CHECK-BASELINE-NEXT: xorl %ebx, %edx
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT: xorl %eax, %ecx
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-BASELINE-NEXT: xorl %r11d, %r8d
+; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w
+; CHECK-BASELINE-NEXT: xorl %r10d, %r9d
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT: xorw %bx, %bp
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bp
+; CHECK-BASELINE-NEXT: xorl %ebx, %ebp
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT: xorw %ax, %bx
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT: xorl %eax, %ebx
+; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT: xorw %r14w, %ax
+; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %ax
+; CHECK-BASELINE-NEXT: xorl %r14d, %eax
+; CHECK-BASELINE-NEXT: movw %ax, 14(%rdi)
+; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi)
+; CHECK-BASELINE-NEXT: movw %bp, 10(%rdi)
+; CHECK-BASELINE-NEXT: movw %r9w, 8(%rdi)
+; CHECK-BASELINE-NEXT: movw %r8w, 6(%rdi)
+; CHECK-BASELINE-NEXT: movw %cx, 4(%rdi)
+; CHECK-BASELINE-NEXT: movw %dx, 2(%rdi)
+; CHECK-BASELINE-NEXT: movw %si, (%rdi)
+; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: popq %rbx
+; CHECK-BASELINE-NEXT: popq %r14
+; CHECK-BASELINE-NEXT: popq %rbp
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: in_v8i16:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: pushq %rbp
+; CHECK-SSE1-NEXT: pushq %r14
+; CHECK-SSE1-NEXT: pushq %rbx
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT: xorl %r10d, %r9d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT: xorl %r11d, %r8d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT: xorl %eax, %ecx
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: xorl %ebx, %esi
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT: xorl %ebx, %esi
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: xorl %ebx, %edx
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx
+; CHECK-SSE1-NEXT: xorl %ebx, %edx
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT: xorl %eax, %ecx
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w
+; CHECK-SSE1-NEXT: xorl %r11d, %r8d
+; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w
+; CHECK-SSE1-NEXT: xorl %r10d, %r9d
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT: xorw %bx, %bp
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bp
+; CHECK-SSE1-NEXT: xorl %ebx, %ebp
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT: xorw %ax, %bx
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT: xorl %eax, %ebx
+; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT: xorw %r14w, %ax
+; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %ax
+; CHECK-SSE1-NEXT: xorl %r14d, %eax
+; CHECK-SSE1-NEXT: movw %ax, 14(%rdi)
+; CHECK-SSE1-NEXT: movw %bx, 12(%rdi)
+; CHECK-SSE1-NEXT: movw %bp, 10(%rdi)
+; CHECK-SSE1-NEXT: movw %r9w, 8(%rdi)
+; CHECK-SSE1-NEXT: movw %r8w, 6(%rdi)
+; CHECK-SSE1-NEXT: movw %cx, 4(%rdi)
+; CHECK-SSE1-NEXT: movw %dx, 2(%rdi)
+; CHECK-SSE1-NEXT: movw %si, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: popq %rbx
+; CHECK-SSE1-NEXT: popq %r14
+; CHECK-SSE1-NEXT: popq %rbp
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_v8i16:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_v8i16:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+ %n0 = xor <8 x i16> %x, %y
+ %n1 = and <8 x i16> %n0, %mask
+ %r = xor <8 x i16> %n1, %y
+ ret <8 x i16> %r
+}
+
+define <4 x i32> @in_v4i32(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind {
+; CHECK-BASELINE-LABEL: in_v4i32:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: pushq %rbx
+; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movl 8(%rdx), %r9d
+; CHECK-BASELINE-NEXT: movl (%rdx), %r11d
+; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movl (%rsi), %edx
+; CHECK-BASELINE-NEXT: xorl %r11d, %edx
+; CHECK-BASELINE-NEXT: movl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT: xorl %r10d, %eax
+; CHECK-BASELINE-NEXT: movl 8(%rsi), %ebx
+; CHECK-BASELINE-NEXT: xorl %r9d, %ebx
+; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi
+; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi
+; CHECK-BASELINE-NEXT: andl 8(%rcx), %ebx
+; CHECK-BASELINE-NEXT: andl 4(%rcx), %eax
+; CHECK-BASELINE-NEXT: andl (%rcx), %edx
+; CHECK-BASELINE-NEXT: xorl %r11d, %edx
+; CHECK-BASELINE-NEXT: xorl %r10d, %eax
+; CHECK-BASELINE-NEXT: xorl %r9d, %ebx
+; CHECK-BASELINE-NEXT: xorl %r8d, %esi
+; CHECK-BASELINE-NEXT: movl %esi, 12(%rdi)
+; CHECK-BASELINE-NEXT: movl %ebx, 8(%rdi)
+; CHECK-BASELINE-NEXT: movl %eax, 4(%rdi)
+; CHECK-BASELINE-NEXT: movl %edx, (%rdi)
+; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: popq %rbx
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: in_v4i32:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andps (%rcx), %xmm1
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_v4i32:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_v4i32:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-XOP-NEXT: vxorps (%rdi), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1
+; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: retq
+ %x = load <4 x i32>, <4 x i32> *%px, align 16
+ %y = load <4 x i32>, <4 x i32> *%py, align 16
+ %mask = load <4 x i32>, <4 x i32> *%pmask, align 16
+ %n0 = xor <4 x i32> %x, %y
+ %n1 = and <4 x i32> %n0, %mask
+ %r = xor <4 x i32> %n1, %y
+ ret <4 x i32> %r
+}
+
+define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
+; CHECK-BASELINE-LABEL: in_v2i64:
+; CHECK-BASELINE: # %bb.0:
+; CHECK-BASELINE-NEXT: xorq %rdx, %rdi
+; CHECK-BASELINE-NEXT: xorq %rcx, %rsi
+; CHECK-BASELINE-NEXT: andq %r9, %rsi
+; CHECK-BASELINE-NEXT: andq %r8, %rdi
+; CHECK-BASELINE-NEXT: xorq %rdx, %rdi
+; CHECK-BASELINE-NEXT: xorq %rcx, %rsi
+; CHECK-BASELINE-NEXT: movq %rdi, %rax
+; CHECK-BASELINE-NEXT: movq %rsi, %rdx
+; CHECK-BASELINE-NEXT: retq
+;
+; CHECK-SSE1-LABEL: in_v2i64:
+; CHECK-SSE1: # %bb.0:
+; CHECK-SSE1-NEXT: xorq %rdx, %rdi
+; CHECK-SSE1-NEXT: xorq %rcx, %rsi
+; CHECK-SSE1-NEXT: andq %r9, %rsi
+; CHECK-SSE1-NEXT: andq %r8, %rdi
+; CHECK-SSE1-NEXT: xorq %rdx, %rdi
+; CHECK-SSE1-NEXT: xorq %rcx, %rsi
+; CHECK-SSE1-NEXT: movq %rdi, %rax
+; CHECK-SSE1-NEXT: movq %rsi, %rdx
+; CHECK-SSE1-NEXT: retq
+;
+; CHECK-SSE2-LABEL: in_v2i64:
+; CHECK-SSE2: # %bb.0:
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-XOP-LABEL: in_v2i64:
+; CHECK-XOP: # %bb.0:
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: retq
+ %n0 = xor <2 x i64> %x, %y
+ %n1 = and <2 x i64> %n0, %mask
+ %r = xor <2 x i64> %n1, %y
+ ret <2 x i64> %r
+}
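
For reference, the in_* functions above all exercise the same masked-merge pattern, ((x ^ y) & mask) ^ y, which is bitwise-equivalent to the two-operand select form (x & mask) | (y & ~mask): where a mask bit is 1 the result is (x ^ y) ^ y = x, and where it is 0 it is 0 ^ y = y. A minimal scalar sketch of the two equivalent forms in LLVM IR (the function names here are illustrative, not part of the test):

  ; Select form: take bits of %x where %m is set, bits of %y elsewhere.
  define i32 @masked_merge_out(i32 %x, i32 %y, i32 %m) {
    %mx = and i32 %x, %m
    %notm = xor i32 %m, -1     ; ~m
    %my = and i32 %y, %notm
    %r = or i32 %mx, %my
    ret i32 %r
  }

  ; Folded form: one and plus two xors, as tested by the in_* functions.
  define i32 @masked_merge_in(i32 %x, i32 %y, i32 %m) {
    %n0 = xor i32 %x, %y
    %n1 = and i32 %n0, %m
    %r = xor i32 %n1, %y
    ret i32 %r
  }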