Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2765,7 +2765,7 @@
   }
 
   // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
-  // Only perform this optimization after type legalization and before
+  // Only perform this optimization up until type legalization, before
   // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
   // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
   // we don't want to undo this promotion.
@@ -2773,7 +2773,7 @@
   // on scalars.
   if ((N0.getOpcode() == ISD::BITCAST ||
        N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
-      Level == AfterLegalizeTypes) {
+      Level <= AfterLegalizeTypes) {
     SDValue In0 = N0.getOperand(0);
     SDValue In1 = N1.getOperand(0);
     EVT In0Ty = In0.getValueType();
Index: test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512-mask-op.ll
+++ test/CodeGen/X86/avx512-mask-op.ll
@@ -74,18 +74,18 @@
   ret void
 }
 
-define i16 @mand16(i16 %x, i16 %y) {
+define i16 @mand16(<16 x i1>* %x, <16 x i1>* %y) {
 ; CHECK-LABEL: mand16:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovw (%rdi), %k0
+; CHECK-NEXT:    kmovw (%rsi), %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k2
 ; CHECK-NEXT:    kxorw %k1, %k0, %k0
 ; CHECK-NEXT:    korw %k0, %k2, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i16 %x to <16 x i1>
-  %mb = bitcast i16 %y to <16 x i1>
+  %ma = load <16 x i1>, <16 x i1>* %x
+  %mb = load <16 x i1>, <16 x i1>* %y
   %mc = and <16 x i1> %ma, %mb
   %md = xor <16 x i1> %ma, %mb
   %me = or <16 x i1> %mc, %md
Index: test/CodeGen/X86/avx512-select.ll
===================================================================
--- test/CodeGen/X86/avx512-select.ll
+++ test/CodeGen/X86/avx512-select.ll
@@ -68,31 +68,35 @@
   ret <16 x double> %sel
 }
 
-define i8 @select05(i8 %a.0, i8 %m) {
+define i8 @select05(<8 x i1>* %a.0, <8 x i1>* %m) {
 ; CHECK-LABEL: select05:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k0
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movzbw (%rsi), %ax
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    movzbw (%rdi), %ax
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
-  %mask = bitcast i8 %m to <8 x i1>
-  %a = bitcast i8 %a.0 to <8 x i1>
+  %mask = load <8 x i1> , <8 x i1>* %m
+  %a = load <8 x i1> , <8 x i1>* %a.0
   %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
   %res = bitcast <8 x i1> %r to i8
   ret i8 %res;
 }
 
-define i8 @select06(i8 %a.0, i8 %m) {
+define i8 @select06(<8 x i1>* %a.0, <8 x i1>* %m) {
 ; CHECK-LABEL: select06:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k0
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movzbw (%rsi), %ax
+; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    movzbw (%rdi), %ax
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
-  %mask = bitcast i8 %m to <8 x i1>
-  %a = bitcast i8 %a.0 to <8 x i1>
+  %mask = load <8 x i1> , <8 x i1>* %m
+  %a = load <8 x i1> , <8 x i1>* %a.0
   %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
   %res = bitcast <8 x i1> %r to i8
   ret i8 %res;
Index: test/CodeGen/X86/avx512bw-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512bw-mask-op.ll
+++ test/CodeGen/X86/avx512bw-mask-op.ll
@@ -77,18 +77,18 @@
   ret void
 }
 
-define i32 @mand32(i32 %x, i32 %y) {
+define i32 @mand32(<32 x i1>* %x, <32 x i1>* %y) {
 ; CHECK-LABEL: mand32:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k0
-; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kmovd (%rdi), %k0
+; CHECK-NEXT:    kmovd (%rsi), %k1
 ; CHECK-NEXT:    kandd %k1, %k0, %k2
 ; CHECK-NEXT:    kxord %k1, %k0, %k0
 ; CHECK-NEXT:    kord %k0, %k2, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i32 %x to <32 x i1>
-  %mb = bitcast i32 %y to <32 x i1>
+  %ma = load <32 x i1>, <32 x i1>* %x
+  %mb = load <32 x i1>, <32 x i1>* %y
   %mc = and <32 x i1> %ma, %mb
   %md = xor <32 x i1> %ma, %mb
   %me = or <32 x i1> %mc, %md
@@ -96,18 +96,18 @@
   ret i32 %ret
 }
 
-define i64 @mand64(i64 %x, i64 %y) {
+define i64 @mand64(<64 x i1>* %x, <64 x i1>* %y) {
 ; CHECK-LABEL: mand64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovq %rdi, %k0
-; CHECK-NEXT:    kmovq %rsi, %k1
+; CHECK-NEXT:    kmovq (%rdi), %k0
+; CHECK-NEXT:    kmovq (%rsi), %k1
 ; CHECK-NEXT:    kandq %k1, %k0, %k2
 ; CHECK-NEXT:    kxorq %k1, %k0, %k0
 ; CHECK-NEXT:    korq %k0, %k2, %k0
 ; CHECK-NEXT:    kmovq %k0, %rax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i64 %x to <64 x i1>
-  %mb = bitcast i64 %y to <64 x i1>
+  %ma = load <64 x i1>, <64 x i1>* %x
+  %mb = load <64 x i1>, <64 x i1>* %y
   %mc = and <64 x i1> %ma, %mb
   %md = xor <64 x i1> %ma, %mb
   %me = or <64 x i1> %mc, %md
Index: test/CodeGen/X86/avx512dq-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512dq-mask-op.ll
+++ test/CodeGen/X86/avx512dq-mask-op.ll
@@ -29,18 +29,18 @@
   ret void
 }
 
-define i8 @mand8(i8 %x, i8 %y) {
+define i8 @mand8(<8 x i1>* %x, <8 x i1>* %y) {
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovb (%rdi), %k0
+; CHECK-NEXT:    kmovb (%rsi), %k1
 ; CHECK-NEXT:    kandb %k1, %k0, %k2
 ; CHECK-NEXT:    kxorb %k1, %k0, %k0
 ; CHECK-NEXT:    korb %k0, %k2, %k0
 ; CHECK-NEXT:    kmovb %k0, %eax
 ; CHECK-NEXT:    retq
-  %ma = bitcast i8 %x to <8 x i1>
-  %mb = bitcast i8 %y to <8 x i1>
+  %ma = load <8 x i1>, <8 x i1>* %x
+  %mb = load <8 x i1>, <8 x i1>* %y
   %mc = and <8 x i1> %ma, %mb
   %md = xor <8 x i1> %ma, %mb
   %me = or <8 x i1> %mc, %md
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1869,48 +1869,34 @@
 define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: mask_v4f32_0127:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: mask_v4f32_0127:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    orps %xmm1, %xmm0
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: mask_v4f32_0127:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_v4f32_0127:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: mask_v4f32_0127:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: mask_v4f32_0127:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: mask_v4f32_0127:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT:    retq
   %1 = bitcast <4 x float> %a to <2 x i64>
   %2 = bitcast <4 x float> %b to <2 x i64>
   %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
@@ -1923,47 +1909,38 @@
 define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: mask_v4i32_0127:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: mask_v4i32_0127:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    orps %xmm1, %xmm0
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: mask_v4i32_0127:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_v4i32_0127:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: mask_v4i32_0127:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mask_v4i32_0127:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX2-NEXT:    retq
   %1 = bitcast <4 x i32> %a to <2 x i64>
   %2 = bitcast <4 x i32> %b to <2 x i64>
Index: test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2140,40 +2140,31 @@
 define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: mask_v8i16_012345ef:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: mask_v8i16_012345ef:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mask_v8i16_012345ef:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: mask_v8i16_012345ef:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mask_v8i16_012345ef:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX2-NEXT:    retq
   %1 = bitcast <8 x i16> %a to <2 x i64>
   %2 = bitcast <8 x i16> %b to <2 x i64>
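As a rough illustration of what the DAGCombiner change enables, the standalone function below exercises the same and/and/or-of-bitcasts pattern as the updated mask_v8i16_012345ef checks. It is a sketch distilled from that test, not part of the diff itself; the function name, the mask constants, and the llc invocation are assumptions made for this example.

; Sketch (assumed reproducer): blend elements 0-5 of %b with elements 6-7 of %a
; using logic ops on the <2 x i64> bitcasts, mirroring mask_v8i16_012345ef.
; Illustrative invocation:
;   llc < repro.ll -mtriple=x86_64-unknown-unknown -mattr=+sse4.1
define <8 x i16> @blend_v8i16_sketch(<8 x i16> %a, <8 x i16> %b) {
  %1 = bitcast <8 x i16> %a to <2 x i64>
  %2 = bitcast <8 x i16> %b to <2 x i64>
  ; keep elements 6-7 of %a (the high 32 bits of the second i64) ...
  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
  ; ... and elements 0-5 of %b
  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
  %5 = or <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <8 x i16>
  ret <8 x i16> %6
}

With the combine now allowed to run before type legalization as well as right after it (Level <= AfterLegalizeTypes), the updated SSE41 checks above expect a single pblendw for this pattern instead of the previous pxor, two pblendw instructions, and por.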