[x86] split 256-bit store of concatenated vectors
This shows up as a side issue to the main problem for the AVX target example from PR37428:
https://bugs.llvm.org/show_bug.cgi?id=37428 - https://godbolt.org/z/7tpRa3

But as we can see in the pile of existing test diffs, it's actually a widespread
problem that affects any AVX or later target. Apart from a couple of oddballs,
I think these are all improvements for the reasons stated in the code comment:
we do not want to use 256-bit (YMM) ops unnecessarily (to avoid vzeroupper and
frequency throttling), and some cores split 256-bit stores into two 128-bit
halves anyway.
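
To make the target pattern concrete, here is a minimal IR sketch (the function
and value names are invented for illustration, not taken from the tests) of a
256-bit store of concatenated 128-bit halves. With this patch, an AVX target
should emit two 128-bit stores instead of vinsertf128 plus a 256-bit vmovups:

define void @store_concat(<4 x i32> %a, <4 x i32> %b, <8 x i32>* %p) {
  ; The shufflevector concatenates two 128-bit vectors into a 256-bit value,
  ; which becomes an ISD::CONCAT_VECTORS node in the SelectionDAG.
  %cat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %cat, <8 x i32>* %p, align 32
  ret void
}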

We could say that MergeConsecutiveStores() is going overboard on some of these
examples, but reining it in would not solve the problem completely. That is
also a reason I'm proposing this as a lowering rather than a combine: if we
tried this earlier, we would loop infinitely fighting the merge code, which
would keep recreating the 256-bit concat store that we just split.
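
To see the potential loop, consider a hypothetical pair of adjacent 128-bit
stores (a sketch; names invented):

define void @two_stores(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %p) {
  ; MergeConsecutiveStores() can merge these adjacent 128-bit stores into one
  ; 256-bit store of a CONCAT_VECTORS node; the new lowering then splits that
  ; store back into two 128-bit stores. Doing the split as an early combine
  ; would ping-pong with the merge indefinitely.
  %q = getelementptr <4 x i32>, <4 x i32>* %p, i64 1
  store <4 x i32> %a, <4 x i32>* %p, align 16
  store <4 x i32> %b, <4 x i32>* %q, align 16
  ret void
}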

Differential Revision: https://reviews.llvm.org/D62498

llvm-svn: 362524
rotateright committed Jun 4, 2019
1 parent f15e3d8 commit 606eb23
Showing 25 changed files with 786 additions and 845 deletions.
11 changes: 11 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1283,6 +1283,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}

if (HasInt256)
@@ -21073,7 +21074,17 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
if (St->isTruncatingStore())
return SDValue();

// If this is a 256-bit store of concatenated ops, we are better off splitting
// that store into two 128-bit stores. This avoids spurious use of 256-bit ops
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT.is256BitVector()) {
if (StoredVal.getOpcode() != ISD::CONCAT_VECTORS || !StoredVal.hasOneUse())
return SDValue();
return split256BitStore(St, DAG);
}

assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
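
For contrast with the guard conditions above: when the stored 256-bit value is
not a one-use ISD::CONCAT_VECTORS node, LowerStore returns SDValue() and the
store is lowered as before. A hypothetical sketch (names invented) that should
still produce a single 256-bit store:

define void @store_plain(<8 x i32> %v, <8 x i32>* %p) {
  ; The stored value is a plain argument rather than a concatenation of two
  ; 128-bit ops, so the split above does not fire and this remains one
  ; 256-bit store (e.g. a single vmovaps).
  store <8 x i32> %v, <8 x i32>* %p, align 32
  ret void
}
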
402 changes: 192 additions & 210 deletions llvm/test/CodeGen/X86/avg.ll


24 changes: 12 additions & 12 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -725,12 +725,12 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
; X86-AVX-LABEL: test_x86_avx_storeu_dq_256:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 # encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
; X86-AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
; X86-AVX-NEXT: vpsubb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xf8,0xca]
; X86-AVX-NEXT: vpsubb %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc2]
; X86-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
; X86-AVX-NEXT: vmovups %ymm0, (%eax) # encoding: [0xc5,0xfc,0x11,0x00]
; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm2 # encoding: [0xc5,0xf9,0xf8,0xd1]
; X86-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1]
; X86-AVX-NEXT: vmovdqu %xmm0, 16(%eax) # encoding: [0xc5,0xfa,0x7f,0x40,0x10]
; X86-AVX-NEXT: vmovdqu %xmm2, (%eax) # encoding: [0xc5,0xfa,0x7f,0x10]
; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
@@ -745,12 +745,12 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
;
; X64-AVX-LABEL: test_x86_avx_storeu_dq_256:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 # encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
; X64-AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
; X64-AVX-NEXT: vpsubb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xf8,0xca]
; X64-AVX-NEXT: vpsubb %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc2]
; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
; X64-AVX-NEXT: vmovups %ymm0, (%rdi) # encoding: [0xc5,0xfc,0x11,0x07]
; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm2 # encoding: [0xc5,0xf9,0xf8,0xd1]
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1]
; X64-AVX-NEXT: vmovdqu %xmm0, 16(%rdi) # encoding: [0xc5,0xfa,0x7f,0x47,0x10]
; X64-AVX-NEXT: vmovdqu %xmm2, (%rdi) # encoding: [0xc5,0xfa,0x7f,0x17]
; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -916,33 +916,29 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X86-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1]
; X86-AVX-NEXT: vmovntdq %ymm0, (%eax) # encoding: [0xc5,0xfd,0xe7,0x00]
; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-AVX-NEXT: vmovntdq %xmm0, (%eax) # encoding: [0xc5,0xf9,0xe7,0x00]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: movnt_dq:
; X86-AVX512VL: # %bb.0:
; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X86-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
; X86-AVX512VL-NEXT: vmovntdq %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-AVX512VL-NEXT: vmovntdq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x00]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: movnt_dq:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1]
; X64-AVX-NEXT: vmovntdq %ymm0, (%rdi) # encoding: [0xc5,0xfd,0xe7,0x07]
; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi) # encoding: [0xc5,0xf9,0xe7,0x07]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: movnt_dq:
; X64-AVX512VL: # %bb.0:
; X64-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
; X64-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
; X64-AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07]
; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX512VL-NEXT: vmovntdq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
16 changes: 7 additions & 9 deletions llvm/test/CodeGen/X86/avx512-trunc-widen.ll
@@ -462,12 +462,10 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
; KNL-LABEL: trunc_wb_512_mem:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rdi)
; KNL-NEXT: vpmovdb %zmm1, 16(%rdi)
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@@ -672,8 +670,8 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
; ALL: ## %bb.0:
; ALL-NEXT: vpmovusdb %zmm0, %xmm0
; ALL-NEXT: vpmovusdb %zmm1, %xmm1
; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: vmovdqu %ymm0, (%rdi)
; ALL-NEXT: vmovdqu %xmm1, 16(%rdi)
; ALL-NEXT: vmovdqu %xmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -952,8 +950,8 @@ define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
; ALL-NEXT: vpmovusdb %zmm0, %xmm0
; ALL-NEXT: vpmovusdb %zmm1, %xmm1
; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: vmovdqu %ymm0, (%rdi)
; ALL-NEXT: vmovdqu %xmm1, 16(%rdi)
; ALL-NEXT: vmovdqu %xmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
16 changes: 7 additions & 9 deletions llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -458,12 +458,10 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
; KNL-LABEL: trunc_wb_512_mem:
; KNL: ## %bb.0:
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rdi)
; KNL-NEXT: vpmovdb %zmm1, 16(%rdi)
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@@ -667,8 +665,8 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
; ALL: ## %bb.0:
; ALL-NEXT: vpmovusdb %zmm0, %xmm0
; ALL-NEXT: vpmovusdb %zmm1, %xmm1
; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: vmovdqu %ymm0, (%rdi)
; ALL-NEXT: vmovdqu %xmm1, 16(%rdi)
; ALL-NEXT: vmovdqu %xmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -948,8 +946,8 @@ define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
; ALL-NEXT: vpmovusdb %zmm0, %xmm0
; ALL-NEXT: vpmovusdb %zmm1, %xmm1
; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: vmovdqu %ymm0, (%rdi)
; ALL-NEXT: vmovdqu %xmm1, 16(%rdi)
; ALL-NEXT: vmovdqu %xmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -1061,12 +1061,12 @@ define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
;
; AVX1-LABEL: test_op_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi)
; AVX1-NEXT: vmovntdq %xmm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1126,12 +1126,12 @@ define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
;
; AVX1-LABEL: test_op_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi)
; AVX1-NEXT: vmovntdq %xmm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1164,12 +1164,12 @@ define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
;
; AVX1-LABEL: test_op_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi)
; AVX1-NEXT: vmovntdq %xmm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1202,12 +1202,12 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
;
; AVX1-LABEL: test_op_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vmovntdq %xmm0, 16(%rdi)
; AVX1-NEXT: vmovntdq %xmm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
15 changes: 8 additions & 7 deletions llvm/test/CodeGen/X86/oddsubvector.ll
@@ -116,13 +116,14 @@ define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %stru
;
; AVX-LABEL: PR40815:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps 16(%rdi), %xmm0
; AVX-NEXT: vmovaps 48(%rdi), %xmm1
; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
; AVX-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm1, (%rsi)
; AVX-NEXT: vmovups %ymm0, 32(%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vmovaps 32(%rdi), %xmm2
; AVX-NEXT: vmovaps 48(%rdi), %xmm3
; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
; AVX-NEXT: vmovaps %xmm3, (%rsi)
; AVX-NEXT: vmovaps %xmm0, 48(%rsi)
; AVX-NEXT: vmovaps %xmm1, 32(%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: PR40815:
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/X86/pmovsx-inreg.ll
@@ -53,12 +53,12 @@ define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind {
;
; AVX1-LABEL: test2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0
; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -134,12 +134,12 @@ define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind {
;
; AVX1-LABEL: test4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -215,12 +215,12 @@ define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
;
; AVX1-LABEL: test6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm0
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -296,12 +296,12 @@ define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind {
;
; AVX1-LABEL: test8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0
; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -377,12 +377,12 @@ define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind {
;
; AVX1-LABEL: test10:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -458,12 +458,12 @@ define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind {
;
; AVX1-LABEL: test12:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq (%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;