Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -5412,7 +5412,7 @@ llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], + Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; } // FP logical ops Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -19210,7 +19210,8 @@ // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } @@ -19226,9 +19227,10 @@ // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); - High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. 
Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -7326,32 +7326,34 @@ multiclass avx512_psadbw_packed opc, SDNode OpNode, - string OpcodeStr, X86VectorVTInfo _src>{ + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ def rr : AVX512BI; + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; let mayLoad = 1 in def rm : AVX512BI; } multiclass avx512_psadbw_packed_all opc, SDNode OpNode, string OpcodeStr, Predicate prd> { let Predicates = [prd] in - defm Z512 : avx512_psadbw_packed, - EVEX_V512; + defm Z512 : avx512_psadbw_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_psadbw_packed, - EVEX_V256; - defm Z128 : avx512_psadbw_packed, - EVEX_V128; + defm Z256 : avx512_psadbw_packed, EVEX_V256; + defm Z128 : avx512_psadbw_packed, EVEX_V128; } } Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -79,8 +79,8 @@ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86psadbw : SDNode<"X86ISD::PSADBW", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>, SDTCisInt<3>]>>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4082,22 +4082,18 @@ int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", 
int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_PMADD, 1>; - -let Predicates = [HasAVX2] in - def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPSADBWYrr VR256:$src2, VR256:$src1)>; let Predicates = [HasAVX] in - def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPSADBWrr VR128:$src2, VR128:$src1)>; - -def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PSADBWrr VR128:$src2, VR128:$src1)>; +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -282,6 +282,7 @@ X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), @@ -1694,6 +1695,7 @@ X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw,
INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1255,15 +1255,15 @@ %res2 = add <8 x i64> %res, %res1 ret <8 x i64> %res2 } -declare <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) +declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) ; CHECK-LABEL: @test_int_x86_avx512_mask_psadb_w_512 ; CHECK-NOT: call ; CHECK: vpsadbw %zmm1 ; CHECK: vpsadbw %zmm2 -define <64 x i8>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ - %res = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) - %res1 = call <64 x i8> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 +define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){ + %res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1) + %res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 } Index: test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-128.ll +++ test/CodeGen/X86/vector-popcnt-128.ll @@ -91,7 +91,7 @@ ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out @@ -206,9 +206,9 @@ ; AVX-NEXT: vpaddb %xmm2, %xmm0, 
%xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) Index: test/CodeGen/X86/vector-popcnt-256.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-256.ll +++ test/CodeGen/X86/vector-popcnt-256.ll @@ -14,14 +14,14 @@ ; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -36,7 +36,7 @@ ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out @@ -56,9 +56,9 @@ ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsadbw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 @@ -67,9 +67,9 @@ ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -86,9 +86,9 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) Index: test/CodeGen/X86/vector-tzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-128.ll +++ test/CodeGen/X86/vector-tzcnt-128.ll @@ -274,9 +274,9 @@ ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -296,9 +296,9 @@ ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, 
%xmm0 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0) @@ -429,9 +429,9 @@ ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -451,9 +451,9 @@ ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) Index: test/CodeGen/X86/vector-tzcnt-256.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-256.ll +++ test/CodeGen/X86/vector-tzcnt-256.ll @@ -21,7 +21,7 @@ ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -29,7 
+29,7 @@ ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -48,7 +48,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0) ret <4 x i64> %out @@ -74,7 +74,7 @@ ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -82,7 +82,7 @@ ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -101,7 +101,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1) ret <4 x i64> %out @@ -128,9 +128,9 @@ ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm5, 
%xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 @@ -140,9 +140,9 @@ ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -163,9 +163,9 @@ ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0) @@ -193,9 +193,9 @@ ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 @@ -205,9 +205,9 @@ ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpsadbw %xmm1, %xmm3, 
%xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -228,9 +228,9 @@ ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)