diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23108,8 +23108,8 @@ static SDValue combineShuffleToZeroExtendVectorInReg(ShuffleVectorSDNode *SVN, SelectionDAG &DAG, const TargetLowering &TLI, + bool LegalTypes, bool LegalOperations) { - bool LegalTypes = true; EVT VT = SVN->getValueType(0); assert(!VT.isScalableVector() && "Encountered scalable shuffle?"); unsigned NumElts = VT.getVectorNumElements(); @@ -24203,8 +24203,8 @@ // Perform this really late, because it could eliminate knowledge // of undef elements created by this shuffle. if (Level < AfterLegalizeTypes) - if (SDValue V = combineShuffleToZeroExtendVectorInReg(SVN, DAG, TLI, - LegalOperations)) + if (SDValue V = combineShuffleToZeroExtendVectorInReg( + SVN, DAG, TLI, LegalTypes, LegalOperations)) return V; return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -o - %s | FileCheck %s +; RUN: llc -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios" @@ -438,69 +438,25 @@ ret void } -; CHECK-LABEL: lCPI8_0: -; CHECK-NEXT: .byte 4 ; 0x4 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 5 ; 0x5 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 6 ; 0x6 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 7 ; 0x7 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI8_1: -; CHECK-NEXT: .byte 0 ; 0x0 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 1 ; 0x1 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 2 ; 0x2 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 3 ; 0x3 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff - define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) { ; CHECK-LABEL: uitofp_v8i8_to_v8f32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh10: -; CHECK-NEXT: adrp x9, lCPI8_0@PAGE -; CHECK-NEXT: Lloh11: -; CHECK-NEXT: adrp x10, lCPI8_1@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh12: -; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF] -; CHECK-NEXT: Lloh13: -; CHECK-NEXT: ldr q1, [x10, lCPI8_1@PAGEOFF] ; CHECK-NEXT: LBB8_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d2, [x0, x8, lsl #3] +; CHECK-NEXT: ldr d0, [x0, x8, lsl #3] ; CHECK-NEXT: add x9, x1, x8, lsl #5 ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: cmp x8, #1000 -; CHECK-NEXT: tbl.16b v3, { v2 }, v0 -; CHECK-NEXT: tbl.16b v2, { v2 }, v1 -; CHECK-NEXT: ucvtf.4s v3, v3 -; CHECK-NEXT: ucvtf.4s v2, v2 -; CHECK-NEXT: stp q2, q3, [x9] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ushll2.4s v1, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: ucvtf.4s v1, v1 +; 
CHECK-NEXT: ucvtf.4s v0, v0 +; CHECK-NEXT: stp q0, q1, [x9] ; CHECK-NEXT: b.eq LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh13 -; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh12 entry: br label %loop @@ -519,118 +475,31 @@ ret void } -; CHECK-LABEL: lCPI9_0: -; CHECK-NEXT: .byte 12 ; 0xc -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 13 ; 0xd -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 14 ; 0xe -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 15 ; 0xf -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI9_1: -; CHECK-NEXT: .byte 8 ; 0x8 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 9 ; 0x9 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 10 ; 0xa -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 11 ; 0xb -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI9_2: -; CHECK-NEXT: .byte 4 ; 0x4 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 5 ; 0x5 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 6 ; 0x6 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 7 ; 0x7 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI9_3: -; CHECK-NEXT: .byte 0 ; 0x0 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 1 ; 0x1 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 2 ; 0x2 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 3 ; 0x3 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff - define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) { ; CHECK-LABEL: uitofp_v16i8_to_v16f32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh14: -; CHECK-NEXT: adrp x9, lCPI9_0@PAGE -; CHECK-NEXT: Lloh15: -; CHECK-NEXT: adrp x10, lCPI9_1@PAGE -; CHECK-NEXT: Lloh16: -; CHECK-NEXT: adrp x11, lCPI9_2@PAGE -; CHECK-NEXT: Lloh17: -; CHECK-NEXT: adrp x12, lCPI9_3@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh18: -; CHECK-NEXT: ldr q0, [x9, lCPI9_0@PAGEOFF] -; CHECK-NEXT: Lloh19: -; CHECK-NEXT: ldr q1, [x10, lCPI9_1@PAGEOFF] -; CHECK-NEXT: Lloh20: -; CHECK-NEXT: ldr q2, [x11, lCPI9_2@PAGEOFF] -; CHECK-NEXT: Lloh21: -; CHECK-NEXT: ldr q3, [x12, lCPI9_3@PAGEOFF] ; CHECK-NEXT: LBB9_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q4, [x0, x8, lsl #4] +; CHECK-NEXT: ldr q0, [x0, x8, lsl #4] ; CHECK-NEXT: add x9, x1, x8, lsl #6 ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: cmp x8, #1000 -; CHECK-NEXT: tbl.16b v5, { v4 }, v0 -; CHECK-NEXT: tbl.16b v6, { v4 }, v1 -; CHECK-NEXT: tbl.16b v7, { v4 }, v2 -; CHECK-NEXT: tbl.16b v4, { v4 }, v3 -; CHECK-NEXT: ucvtf.4s v5, v5 -; CHECK-NEXT: ucvtf.4s v6, v6 -; CHECK-NEXT: ucvtf.4s v7, v7 -; 
CHECK-NEXT: ucvtf.4s v4, v4 -; CHECK-NEXT: stp q6, q5, [x9, #32] -; CHECK-NEXT: stp q4, q7, [x9] +; CHECK-NEXT: ushll2.8h v1, v0, #0 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ushll2.4s v2, v1, #0 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: ushll2.4s v3, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: ucvtf.4s v2, v2 +; CHECK-NEXT: ucvtf.4s v1, v1 +; CHECK-NEXT: ucvtf.4s v3, v3 +; CHECK-NEXT: ucvtf.4s v0, v0 +; CHECK-NEXT: stp q1, q2, [x9, #32] +; CHECK-NEXT: stp q0, q3, [x9] ; CHECK-NEXT: b.eq LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21 -; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh20 -; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh19 -; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh18 entry: br label %loop diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -573,53 +573,35 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) { ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh2: -; CHECK-NEXT: adrp x9, lCPI24_0@PAGE -; CHECK-NEXT: Lloh3: -; CHECK-NEXT: adrp x10, lCPI24_1@PAGE -; CHECK-NEXT: Lloh4: -; CHECK-NEXT: adrp x11, lCPI24_2@PAGE -; CHECK-NEXT: Lloh5: -; CHECK-NEXT: adrp x12, lCPI24_3@PAGE -; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff +; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: ldr q0, [x9, lCPI24_0@PAGEOFF] -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr q1, [x10, lCPI24_1@PAGEOFF] -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: ldr q3, [x11, lCPI24_2@PAGEOFF] -; CHECK-NEXT: Lloh9: -; CHECK-NEXT: ldr q4, [x12, lCPI24_3@PAGEOFF] ; CHECK-NEXT: LBB24_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q5, [x0, x8] +; CHECK-NEXT: ldr q1, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: cmgt.16b v6, v5, v2 -; CHECK-NEXT: tbl.16b v7, { v5 }, v0 -; CHECK-NEXT: tbl.16b v16, { v5 }, v1 -; CHECK-NEXT: sshll2.8h v18, v6, #0 -; CHECK-NEXT: tbl.16b v17, { v5 }, v3 -; CHECK-NEXT: sshll2.4s v19, v18, #0 -; CHECK-NEXT: sshll.4s v18, v18, #0 -; CHECK-NEXT: tbl.16b v5, { v5 }, v4 -; CHECK-NEXT: sshll.8h v6, v6, #0 -; CHECK-NEXT: and.16b v7, v7, v19 -; CHECK-NEXT: and.16b v16, v16, v18 -; CHECK-NEXT: stp q16, q7, [x1, #32] -; CHECK-NEXT: sshll2.4s v7, v6, #0 -; CHECK-NEXT: sshll.4s v6, v6, #0 -; CHECK-NEXT: and.16b v7, v17, v7 +; CHECK-NEXT: cmgt.16b v2, v1, v0 +; CHECK-NEXT: ushll2.8h v3, v1, #0 +; CHECK-NEXT: sshll2.8h v4, v2, #0 +; CHECK-NEXT: ushll2.4s v5, v3, #0 +; CHECK-NEXT: ushll.4s v3, v3, #0 +; CHECK-NEXT: sshll2.4s v6, v4, #0 +; CHECK-NEXT: sshll.4s v4, v4, #0 +; CHECK-NEXT: ushll.8h v1, v1, #0 +; CHECK-NEXT: sshll.8h v2, v2, #0 ; CHECK-NEXT: and.16b v5, v5, v6 -; CHECK-NEXT: stp q5, q7, [x1], #64 +; CHECK-NEXT: and.16b v3, v3, v4 +; CHECK-NEXT: stp q3, q5, [x1, #32] +; CHECK-NEXT: sshll2.4s v4, v2, #0 +; CHECK-NEXT: sshll.4s v2, v2, #0 +; CHECK-NEXT: ushll2.4s v3, v1, #0 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: and.16b v3, v3, v4 +; CHECK-NEXT: and.16b v1, v1, v2 +; CHECK-NEXT: stp q1, q3, [x1], #64 ; CHECK-NEXT: b.ne LBB24_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9 -; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8 -; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 -; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 entry: br label %loop @@ -643,23 +625,23 @@ define void 
@extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) { ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh10: +; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x9, lCPI25_0@PAGE -; CHECK-NEXT: Lloh11: +; CHECK-NEXT: Lloh3: ; CHECK-NEXT: adrp x10, lCPI25_1@PAGE -; CHECK-NEXT: Lloh12: +; CHECK-NEXT: Lloh4: ; CHECK-NEXT: adrp x11, lCPI25_2@PAGE -; CHECK-NEXT: Lloh13: +; CHECK-NEXT: Lloh5: ; CHECK-NEXT: adrp x12, lCPI25_3@PAGE ; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh14: +; CHECK-NEXT: Lloh6: ; CHECK-NEXT: ldr q0, [x9, lCPI25_0@PAGEOFF] -; CHECK-NEXT: Lloh15: +; CHECK-NEXT: Lloh7: ; CHECK-NEXT: ldr q1, [x10, lCPI25_1@PAGEOFF] -; CHECK-NEXT: Lloh16: +; CHECK-NEXT: Lloh8: ; CHECK-NEXT: ldr q3, [x11, lCPI25_2@PAGEOFF] -; CHECK-NEXT: Lloh17: +; CHECK-NEXT: Lloh9: ; CHECK-NEXT: ldr q4, [x12, lCPI25_3@PAGEOFF] ; CHECK-NEXT: LBB25_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -686,10 +668,10 @@ ; CHECK-NEXT: b.ne LBB25_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 -; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16 -; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 -; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 +; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9 +; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8 +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 entry: br label %loop @@ -714,23 +696,23 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) { ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh18: +; CHECK-NEXT: Lloh10: ; CHECK-NEXT: adrp x9, lCPI26_0@PAGE -; CHECK-NEXT: Lloh19: +; CHECK-NEXT: Lloh11: ; CHECK-NEXT: adrp x10, lCPI26_1@PAGE -; CHECK-NEXT: Lloh20: +; CHECK-NEXT: Lloh12: ; CHECK-NEXT: adrp x11, lCPI26_2@PAGE -; CHECK-NEXT: Lloh21: +; CHECK-NEXT: Lloh13: ; CHECK-NEXT: adrp x12, lCPI26_3@PAGE ; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh22: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: ldr q0, [x9, lCPI26_0@PAGEOFF] -; CHECK-NEXT: Lloh23: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: ldr q1, [x10, lCPI26_1@PAGEOFF] -; CHECK-NEXT: Lloh24: +; CHECK-NEXT: Lloh16: ; CHECK-NEXT: ldr q3, [x11, lCPI26_2@PAGEOFF] -; CHECK-NEXT: Lloh25: +; CHECK-NEXT: Lloh17: ; CHECK-NEXT: ldr q4, [x12, lCPI26_3@PAGEOFF] ; CHECK-NEXT: LBB26_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -757,10 +739,10 @@ ; CHECK-NEXT: b.ne LBB26_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25 -; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24 -; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23 -; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22 +; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16 +; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 +; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 entry: br label %loop diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1,77 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s -; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck %s -; RUN: llc 
-mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s - -; CHECK-LABEL: lCPI0_0: -; CHECK-NEXT: .byte 0 ; 0x0 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 1 ; 0x1 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 2 ; 0x2 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 3 ; 0x3 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT:lCPI0_1: -; CHECK-NEXT: .byte 4 ; 0x4 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 5 ; 0x5 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 6 ; 0x6 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 7 ; 0x7 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT:lCPI0_2: -; CHECK-NEXT: .byte 8 ; 0x8 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 9 ; 0x9 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 10 ; 0xa -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 11 ; 0xb -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT:lCPI0_3: -; CHECK-NEXT: .byte 12 ; 0xc -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 13 ; 0xd -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 14 ; 0xe -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 15 ; 0xf -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff +; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s +; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI --check-prefix=CHECK-BE %s +; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s +; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI --check-prefix=CHECK-BE %s ; CHECK-BE: .LCPI0_0: ; CHECK-BE-NEXT: .byte 255 // 0xff @@ -147,41 +78,23 @@ define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh0: -; CHECK-NEXT: adrp x9, lCPI0_0@PAGE -; CHECK-NEXT: Lloh1: -; CHECK-NEXT: adrp x10, lCPI0_1@PAGE -; CHECK-NEXT: Lloh2: -; CHECK-NEXT: adrp x11, lCPI0_2@PAGE -; CHECK-NEXT: Lloh3: -; CHECK-NEXT: adrp x12, lCPI0_3@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh4: -; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF] -; CHECK-NEXT: Lloh5: -; CHECK-NEXT: ldr q1, [x10, lCPI0_1@PAGEOFF] -; CHECK-NEXT: Lloh6: -; CHECK-NEXT: ldr q2, [x11, lCPI0_2@PAGEOFF] -; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr q3, [x12, 
lCPI0_3@PAGEOFF] ; CHECK-NEXT: LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q4, [x0, x8] +; CHECK-NEXT: ldr q0, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: tbl.16b v5, { v4 }, v3 -; CHECK-NEXT: tbl.16b v6, { v4 }, v2 -; CHECK-NEXT: tbl.16b v7, { v4 }, v1 -; CHECK-NEXT: tbl.16b v4, { v4 }, v0 -; CHECK-NEXT: stp q6, q5, [x1, #32] -; CHECK-NEXT: stp q4, q7, [x1], #64 +; CHECK-NEXT: ushll2.8h v1, v0, #0 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ushll2.4s v2, v1, #0 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: ushll2.4s v3, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: stp q1, q2, [x1, #32] +; CHECK-NEXT: stp q0, q3, [x1], #64 ; CHECK-NEXT: b.ne LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 -; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 -; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 -; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 ; ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -549,41 +462,6 @@ ret void } -; CHECK-LABEL: lCPI6_0: -; CHECK-NEXT: .byte 0 ; 0x0 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 1 ; 0x1 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 2 ; 0x2 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 3 ; 0x3 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI6_1: -; CHECK-NEXT: .byte 4 ; 0x4 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 5 ; 0x5 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 6 ; 0x6 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 7 ; 0x7 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff - ; CHECK-BE: .LCPI6_0: ; CHECK-BE-NEXT: .byte 255 // 0xff ; CHECK-BE-NEXT: .byte 255 // 0xff @@ -622,28 +500,19 @@ define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh8: -; CHECK-NEXT: adrp x9, lCPI6_0@PAGE -; CHECK-NEXT: Lloh9: -; CHECK-NEXT: adrp x10, lCPI6_1@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh10: -; CHECK-NEXT: ldr q0, [x9, lCPI6_0@PAGEOFF] -; CHECK-NEXT: Lloh11: -; CHECK-NEXT: ldr q1, [x10, lCPI6_1@PAGEOFF] ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d2, [x0, x8] +; CHECK-NEXT: ldr d0, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: tbl.16b v3, { v2 }, v1 -; CHECK-NEXT: tbl.16b v2, { v2 }, v0 -; CHECK-NEXT: stp q2, q3, [x1], #64 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ushll2.4s v1, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: stp q0, q1, [x1], #64 ; CHECK-NEXT: b.ne LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh11 -; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh10 ; ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -993,6 +862,23 @@ ret void } +; CHECK-BE: .LCPI11_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 
0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v4i8_to_v4i32_in_loop: ; CHECK: ; %bb.0: ; %entry @@ -1027,8 +913,6 @@ ; CHECK-BE-NEXT: b.ne .LBB11_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret - - entry: br label %loop @@ -1047,58 +931,6 @@ ret void } -; CHECK-LABEL: lCPI12_0: -; CHECK-NEXT: .byte 0 ; 0x0 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 1 ; 0x1 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 2 ; 0x2 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 3 ; 0x3 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI12_1: -; CHECK-NEXT: .byte 4 ; 0x4 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 5 ; 0x5 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 6 ; 0x6 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 7 ; 0x7 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: lCPI12_2: -; CHECK-NEXT: .byte 8 ; 0x8 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 9 ; 0x9 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 10 ; 0xa -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 11 ; 0xb -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff - ; CHECK-BE-LABEL: .LCPI12_0: ; CHECK-BE-NEXT: .byte 255 // 0xff ; CHECK-BE-NEXT: .byte 255 // 0xff @@ -1154,35 +986,22 @@ define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh12: -; CHECK-NEXT: adrp x9, lCPI12_0@PAGE -; CHECK-NEXT: Lloh13: -; CHECK-NEXT: adrp x10, lCPI12_1@PAGE -; CHECK-NEXT: Lloh14: -; CHECK-NEXT: adrp x11, lCPI12_2@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh15: -; CHECK-NEXT: ldr q0, [x9, lCPI12_0@PAGEOFF] -; CHECK-NEXT: Lloh16: -; CHECK-NEXT: ldr q1, [x10, lCPI12_1@PAGEOFF] -; CHECK-NEXT: Lloh17: -; CHECK-NEXT: ldr q2, [x11, lCPI12_2@PAGEOFF] ; CHECK-NEXT: LBB12_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q3, [x0, x8] +; CHECK-NEXT: ldr q0, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: tbl.16b v4, { v3 }, v2 -; CHECK-NEXT: tbl.16b v5, { v3 }, v1 -; CHECK-NEXT: tbl.16b v3, { v3 }, v0 -; CHECK-NEXT: stp q5, q4, [x1, #16] -; CHECK-NEXT: str q3, [x1], #64 +; CHECK-NEXT: ushll2.8h v1, v0, #0 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: ushll2.4s v2, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; 
CHECK-NEXT: stp q2, q1, [x1, #16] +; CHECK-NEXT: str q0, [x1], #64 ; CHECK-NEXT: b.ne LBB12_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh17 -; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh16 -; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh15 ; ; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -1214,8 +1033,6 @@ ; CHECK-BE-NEXT: b.ne .LBB12_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret - - entry: br label %loop @@ -2152,22 +1969,22 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh18: +; CHECK-NEXT: Lloh0: ; CHECK-NEXT: adrp x9, lCPI20_0@PAGE -; CHECK-NEXT: Lloh19: +; CHECK-NEXT: Lloh1: ; CHECK-NEXT: adrp x10, lCPI20_1@PAGE -; CHECK-NEXT: Lloh20: +; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x11, lCPI20_2@PAGE -; CHECK-NEXT: Lloh21: +; CHECK-NEXT: Lloh3: ; CHECK-NEXT: adrp x12, lCPI20_3@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh22: +; CHECK-NEXT: Lloh4: ; CHECK-NEXT: ldr q0, [x9, lCPI20_0@PAGEOFF] -; CHECK-NEXT: Lloh23: +; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr q1, [x10, lCPI20_1@PAGEOFF] -; CHECK-NEXT: Lloh24: +; CHECK-NEXT: Lloh6: ; CHECK-NEXT: ldr q2, [x11, lCPI20_2@PAGEOFF] -; CHECK-NEXT: Lloh25: +; CHECK-NEXT: Lloh7: ; CHECK-NEXT: ldr q3, [x12, lCPI20_3@PAGEOFF] ; CHECK-NEXT: LBB20_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2188,10 +2005,10 @@ ; CHECK-NEXT: b.ne LBB20_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25 -; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24 -; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23 -; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22 +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 ; ; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -2234,7 +2051,6 @@ ; CHECK-BE-NEXT: b.ne .LBB20_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret - entry: br label %loop @@ -2479,30 +2295,30 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh26: +; CHECK-NEXT: Lloh8: ; CHECK-NEXT: adrp x9, lCPI21_0@PAGE -; CHECK-NEXT: Lloh27: +; CHECK-NEXT: Lloh9: ; CHECK-NEXT: adrp x10, lCPI21_1@PAGE -; CHECK-NEXT: Lloh28: +; CHECK-NEXT: Lloh10: ; CHECK-NEXT: adrp x11, lCPI21_2@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh29: +; CHECK-NEXT: Lloh11: ; CHECK-NEXT: ldr q0, [x9, lCPI21_0@PAGEOFF] -; CHECK-NEXT: Lloh30: +; CHECK-NEXT: Lloh12: ; CHECK-NEXT: adrp x9, lCPI21_3@PAGE -; CHECK-NEXT: Lloh31: +; CHECK-NEXT: Lloh13: ; CHECK-NEXT: ldr q1, [x10, lCPI21_1@PAGEOFF] -; CHECK-NEXT: Lloh32: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: adrp x10, lCPI21_4@PAGE -; CHECK-NEXT: Lloh33: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: ldr q2, [x11, lCPI21_2@PAGEOFF] -; CHECK-NEXT: Lloh34: +; CHECK-NEXT: Lloh16: ; CHECK-NEXT: adrp x11, lCPI21_5@PAGE -; CHECK-NEXT: Lloh35: +; CHECK-NEXT: Lloh17: ; CHECK-NEXT: ldr q3, [x9, lCPI21_3@PAGEOFF] -; CHECK-NEXT: Lloh36: +; CHECK-NEXT: Lloh18: ; CHECK-NEXT: ldr q4, [x10, lCPI21_4@PAGEOFF] -; CHECK-NEXT: Lloh37: +; CHECK-NEXT: Lloh19: ; CHECK-NEXT: ldr q5, [x11, lCPI21_5@PAGEOFF] ; CHECK-NEXT: LBB21_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2530,15 +2346,15 @@ ; CHECK-NEXT: b.ne LBB21_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr 
Lloh34, Lloh37 -; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh36 -; CHECK-NEXT: .loh AdrpLdr Lloh30, Lloh35 -; CHECK-NEXT: .loh AdrpAdrp Lloh28, Lloh34 -; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh33 -; CHECK-NEXT: .loh AdrpAdrp Lloh27, Lloh32 -; CHECK-NEXT: .loh AdrpLdr Lloh27, Lloh31 -; CHECK-NEXT: .loh AdrpAdrp Lloh26, Lloh30 -; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh29 +; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh19 +; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh18 +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh17 +; CHECK-NEXT: .loh AdrpAdrp Lloh10, Lloh16 +; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh15 +; CHECK-NEXT: .loh AdrpAdrp Lloh9, Lloh14 +; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh13 +; CHECK-NEXT: .loh AdrpAdrp Lloh8, Lloh12 +; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh11 ; ; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -2605,9 +2421,6 @@ ; CHECK-BE-NEXT: b.ne .LBB21_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret - - - entry: br label %loop diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1842,12 +1842,12 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -2187,12 +2187,12 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -2709,13 +2709,16 @@ ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2: @@ -2724,7 +2727,7 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) @@ -2738,7 +2741,7 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2921,66 +2924,44 @@ ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4: -; AVX2-SLOW: 
# %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_v48i8_to_v12i32_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3160,65 +3141,43 @@ ; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: 
vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_v48i8_to_v6i64_factor8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3381,29 +3340,37 @@ ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX-NEXT: vpslldq {{.*#+}} 
xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3411,14 +3378,18 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3426,14 +3397,18 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3441,30 +3416,54 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmovzxbq 
{{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_v48i8_to_v3i128_factor16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,3,3,3,3,u,u,u,u] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-FAST-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -3678,15 +3677,15 @@ ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -3699,13 +3698,16 @@ ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2: @@ -3714,7 +3716,7 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) @@ -3726,13 +3728,12 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3740,10 +3741,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -3986,66 +3984,44 @@ ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq 
{{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_v24i16_to_v6i64_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: 
retq ; ; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4053,9 +4029,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4297,65 +4271,74 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpslldq 
{{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4364,11 +4347,14 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 @@ -4381,10 +4367,15 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <4,u,u,u,5,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, 
%ymm1 @@ -4393,35 +4384,19 @@ ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec384_v24i16_to_v3i128_factor8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,40,17,18,19,20,21,22,23,41,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -4659,15 +4634,15 @@ ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -4677,20 +4652,19 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; 
AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2: @@ -4724,10 +4698,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4956,54 +4927,60 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5011,11 +4988,13 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
<0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -5025,9 +5004,11 @@ ; ; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u> +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -5038,35 +5019,19 @@ ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: movb $17, %al -; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: movb $17, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec384_v12i32_to_v3i128_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15] +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -5337,17 +5302,21 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; 
AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] -; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5355,11 +5324,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,3] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) @@ -5371,14 +5340,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: movb $85, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5386,12 +5357,11 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: movb $5, %al +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movb $85, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -5887,24 +5857,25 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: 
paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: paddb 32(%rdx), %xmm4 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8: @@ -5913,20 +5884,19 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psrlq $48, %xmm3 -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: psrld $16, %xmm2 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v8i64_factor8: @@ -5936,9 +5906,9 @@ ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 @@ -5997,28 +5967,51 @@ } define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE-LABEL: vec512_v64i8_to_v4i128_factor16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb 48(%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm0, 16(%rcx) -; SSE-NEXT: retq +; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld $16, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 48(%rdx), %xmm1 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0] +; SSE42-NEXT: pand %xmm0, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm3 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX: # %bb.0: @@ -6099,57 +6092,38 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq +; AVX512F-LABEL: vec512_v64i8_to_v4i128_factor16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), 
%xmm0, %xmm0 +; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -6159,13 +6133,13 @@ ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,3,3,3,3,u,u,u,u] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
<0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-FAST-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -6372,16 +6346,16 @@ ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE42-NEXT: pxor %xmm3, %xmm3 +; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: paddb 48(%rdx), %xmm1 -; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) @@ -6525,8 +6499,8 @@ ; ; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -6577,19 +6551,18 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq @@ -6599,21 +6572,21 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} 
xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, 48(%rcx) -; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v4i128_factor8: @@ -6621,37 +6594,36 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} 
ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper @@ -6659,17 +6631,17 @@ ; ; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -6677,17 +6649,17 @@ ; ; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-NEXT: vzeroupper
@@ -6697,18 +6669,20 @@
 ; AVX512F-SLOW: # %bb.0:
 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-SLOW-NEXT: vzeroupper
 ; AVX512F-SLOW-NEXT: retq
 ;
@@ -6716,25 +6690,30 @@
 ; AVX512F-FAST: # %bb.0:
 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <4,u,u,u,5,u,u,u>
+; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
 ; AVX512F-FAST-NEXT: vzeroupper
 ; AVX512F-FAST-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,40,17,18,19,20,21,22,23,41,25,26,27,28,29,30,31]
 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
@@ -6984,16 +6963,16 @@
 ; SSE42-NEXT: movdqa 16(%rdi), %xmm1
 ; SSE42-NEXT: paddb (%rsi), %xmm0
 ; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: pxor %xmm3, %xmm3
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
 ; SSE42-NEXT: paddb (%rdx), %xmm4
 ; SSE42-NEXT: paddb 48(%rdx), %xmm1
-; SSE42-NEXT: paddb 32(%rdx), %xmm3
-; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx)
 ; SSE42-NEXT: movdqa %xmm4, (%rcx)
 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
@@ -7005,26 +6984,19 @@
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
@@ -7081,22 +7053,23 @@
 ; SSE2-NEXT: movdqa (%rdi), %xmm0
 ; SSE2-NEXT: paddb (%rsi), %xmm0
 ; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: xorps %xmm3, %xmm3
-; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: paddb 48(%rdx), %xmm3
 ; SSE2-NEXT: paddb 32(%rdx), %xmm4
-; SSE2-NEXT: paddb (%rdx), %xmm3
-; SSE2-NEXT: paddb 48(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
-; SSE2-NEXT: movdqa %xmm3, (%rcx)
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
 ; SSE2-NEXT: movdqa %xmm4, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
 ; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
 ; SSE2-NEXT: retq
 ;
@@ -7126,40 +7099,40 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
 ; AVX2-SLOW-NEXT: vzeroupper
@@ -7167,17 +7140,17 @@
 ;
 ; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -7187,15 +7160,15 @@
 ; AVX2-FAST: # %bb.0:
 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u>
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-NEXT: vzeroupper
@@ -7203,31 +7176,31 @@
 ;
 ; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: movb $17, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -7519,8 +7492,8 @@
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
@@ -7541,13 +7514,13 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
 ; AVX2-NEXT: vzeroupper
@@ -7557,6 +7530,8 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: movb $85, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
@@ -7568,37 +7543,19 @@
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
-; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: movb $5, %al
-; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT: movb $5, %al
-; AVX512BW-FAST-NEXT: kmovd %eax, %k1
-; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,5,3,7]
-; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec512_v8i64_to_v4i128_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movb $85, %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -7835,9 +7792,9 @@
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, %xmm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: movb $51, %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT: vzeroupper