diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5145,17 +5145,18 @@ // TLI: Target lowering used to determine legal types. // Width: Width left need to load/store. // WidenVT: The widen vector type to load to/store from -// Align: If 0, don't allow use of a wider type +// NumDereferenceableBytes: If 0, don't allow use of a wider type // WidenEx: If Align is not 0, the amount additional we can load/store from. -static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, +static EVT FindMemType(SelectionDAG &DAG, const TargetLowering &TLI, unsigned Width, EVT WidenVT, - unsigned Align = 0, unsigned WidenEx = 0) { + unsigned NumDereferenceableBytes = 0, + unsigned WidenEx = 0) { EVT WidenEltVT = WidenVT.getVectorElementType(); const bool Scalable = WidenVT.isScalableVector(); unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize(); unsigned WidenEltWidth = WidenEltVT.getSizeInBits(); - unsigned AlignInBits = Align*8; + unsigned NumDereferenceableBits = NumDereferenceableBytes * 8; // If we have one element to load/store, return it. EVT RetVT = WidenEltVT; @@ -5178,8 +5179,9 @@ Action == TargetLowering::TypePromoteInteger) && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && - (MemVTWidth <= Width || - (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { + (MemVTWidth <= Width || (NumDereferenceableBytes != 0 && + MemVTWidth <= NumDereferenceableBits && + MemVTWidth <= Width + WidenEx))) { if (MemVTWidth == WidenWidth) return MemVT; RetVT = MemVT; @@ -5203,8 +5205,9 @@ WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && - (MemVTWidth <= Width || - (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { + (MemVTWidth <= Width || (NumDereferenceableBytes != 0 && + MemVTWidth <= NumDereferenceableBits && + MemVTWidth <= Width + WidenEx))) { if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT) return MemVT; } @@ -5270,13 +5273,23 @@ TypeSize LdWidth = LdVT.getSizeInBits(); TypeSize WidenWidth = WidenVT.getSizeInBits(); TypeSize WidthDiff = WidenWidth - LdWidth; - // Allow wider loads if they are sufficiently aligned to avoid memory faults - // and if the original load is simple. - unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment(); + unsigned NumDereferenceableBytes = 0; + // Allow wider loads if the original load is simple. + if (LD->isSimple()) { + NumDereferenceableBytes = LD->getAlignment(); + if (!LdWidth.isScalable()) + NumDereferenceableBytes = + std::max(NumDereferenceableBytes, (unsigned)LdWidth / 8); + if (!WidenWidth.isScalable() && NumDereferenceableBytes < WidenWidth / 8 && + LD->getPointerInfo().isDereferenceable( + WidenWidth / 8, *DAG.getContext(), DAG.getDataLayout())) + NumDereferenceableBytes = + std::max(NumDereferenceableBytes, (unsigned)WidenWidth / 8); + } // Find the vector type that can load from. 
- EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign, - WidthDiff.getKnownMinSize()); + EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, + NumDereferenceableBytes, WidthDiff.getKnownMinSize()); TypeSize NewVTWidth = NewVT.getSizeInBits(); SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(), LD->getOriginalAlign(), MMOFlags, AAInfo); @@ -5312,13 +5325,17 @@ MachinePointerInfo MPI = LD->getPointerInfo(); do { LdWidth -= NewVTWidth; + if (!NewVTWidth.isScalable()) + NumDereferenceableBytes -= NewVTWidth; + else + NumDereferenceableBytes = 0; IncrementPointer(cast(LdOp), NewVT, MPI, BasePtr, &ScaledOffset); if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) { // The current type we are using is too large. Find a better size. - NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign, - WidthDiff.getKnownMinSize()); + NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, + NumDereferenceableBytes, WidthDiff.getKnownMinSize()); NewVTWidth = NewVT.getSizeInBits(); } diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -162,13 +162,13 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s10, -1 @@ -196,14 +196,14 @@ ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1085,27 +1085,28 @@ ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: flat_load_ubyte v11, v[6:7] -; VI-NEXT: flat_load_ubyte v8, v[8:9] -; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: v_add_u32_e32 v10, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, 
vcc +; VI-NEXT: flat_load_ubyte v14, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[6:7] +; VI-NEXT: flat_load_ubyte v5, v[8:9] +; VI-NEXT: flat_load_ubyte v6, v[10:11] +; VI-NEXT: flat_load_ubyte v7, v[12:13] ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11 +; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 @@ -1118,26 +1119,25 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x5 +; GFX10-NEXT: s_clause 0x6 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1 ; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 8, v1 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v5, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -383,7 +383,7 @@ ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 -; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 @@ -454,12 +454,12 @@ ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; SI-DAG: s_load_dwordx8 
s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21 -; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84 -; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60 +; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9 +; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 +; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind { entry: store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8 @@ -479,12 +479,12 @@ ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21 -; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84 -; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60 +; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9 +; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 +; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 +; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind { entry: store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -6,11 +6,11 @@ ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 ; SI-NEXT: s_mov_b32 s19, s3 @@ -47,11 +47,11 @@ ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x44 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 ; VI-NEXT: s_mov_b32 s19, s3 @@ -421,11 +421,11 @@ ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 
s17, s7 ; SI-NEXT: s_mov_b32 s19, s3 @@ -475,11 +475,11 @@ ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x44 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 ; VI-NEXT: s_mov_b32 s19, s3 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -68,11 +68,11 @@ ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 ; SI-NEXT: s_mov_b32 s19, s3 @@ -107,11 +107,11 @@ ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x44 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 ; VI-NEXT: s_mov_b32 s19, s3 diff --git a/llvm/test/CodeGen/ARM/legalize-bitcast.ll b/llvm/test/CodeGen/ARM/legalize-bitcast.ll --- a/llvm/test/CodeGen/ARM/legalize-bitcast.ll +++ b/llvm/test/CodeGen/ARM/legalize-bitcast.ll @@ -11,11 +11,9 @@ ; CHECK-NEXT: movw r0, :lower16:vec6_p ; CHECK-NEXT: movt r0, :upper16:vec6_p ; CHECK-NEXT: vld1.8 {d16}, [r0]! -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: @ implicit-def: $d17 -; CHECK-NEXT: vmov.32 d17[0], r0 -; CHECK-NEXT: vrev32.16 d18, d17 ; CHECK-NEXT: vrev16.8 d16, d16 +; CHECK-NEXT: vld1.64 {d17}, [r0] +; CHECK-NEXT: vrev64.16 d18, d17 ; CHECK-NEXT: @ kill: def $d16 killed $d16 def $q8 ; CHECK-NEXT: vmov.f64 d17, d18 ; CHECK-NEXT: vstmia sp, {d16, d17} @ 16-byte Spill diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll --- a/llvm/test/CodeGen/ARM/vector-load.ll +++ b/llvm/test/CodeGen/ARM/vector-load.ll @@ -253,10 +253,8 @@ } ; CHECK-LABEL: test_silly_load: -; CHECK: vldr d{{[0-9]+}}, [r0, #16] -; CHECK: movs r1, #24 -; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1 -; CHECK: ldr {{r[0-9]+}}, [r0] +; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]! 
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128] define void @test_silly_load(<28 x i8>* %addr) { load volatile <28 x i8>, <28 x i8>* %addr diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -6,21 +6,26 @@ define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) { ; CHECK-LABEL: vld3_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmov.f64 d2, d0 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r12, lr, d0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: add.w r3, r12, lr +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vmov.f64 d6, d0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: strd r2, r0, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <6 x i32>, <6 x i32>* %src, align 4 %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> @@ -210,21 +215,16 @@ define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) { ; CHECK-LABEL: vld3_v2i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: ldr r2, [r0, #8] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: str r2, [sp] -; CHECK-NEXT: vmov.f64 d2, d0 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldrh.u32 q1, [r3] +; CHECK-NEXT: vmov.f64 d6, d0 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f64 d6, d1 ; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: add r0, r2 @@ -235,7 +235,6 @@ ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1] -; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr entry: %l1 = load <6 x i16>, <6 x i16>* %src, align 4 @@ -251,30 +250,30 @@ define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) { ; CHECK-LABEL: vld3_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r5, q0[6] -; CHECK-NEXT: vmov.u16 r6, q0[0] -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: vmov.u16 lr, q0[2] -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r5 -; CHECK-NEXT: vmov.u16 r5, q0[7] -; CHECK-NEXT: vmov.u16 r6, q0[1] -; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-NEXT: vmov.u16 r5, q0[3] -; CHECK-NEXT: vmov.u16 r6, q0[4] -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 -; CHECK-NEXT: vmov.u16 r12, q0[5] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov.u16 r0, 
q3[2] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 ; CHECK-NEXT: vadd.i32 q0, q1, q2 -; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vstrh.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <12 x i16>, <12 x i16>* %src, align 4 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> @@ -504,38 +503,30 @@ define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) { ; CHECK-LABEL: vld3_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrb.u16 q0, [r0] -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: str r0, [sp] -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.u16 r4, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.u16 r4, q0[1] -; CHECK-NEXT: vmov.u16 r12, q0[5] -; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vmov.u16 lr, q0[2] +; CHECK-NEXT: vldrb.u16 q3, [r0, #8] +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q3[1] ; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vldrb.u16 q0, [r3] -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r3 -; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.u16 r0, q3[0] +; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov q2[2], q2[0], lr, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov q2[3], q2[1], r12, r0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q3[3] +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 ; CHECK-NEXT: vadd.i32 q0, q1, q2 ; CHECK-NEXT: vstrb.32 q0, [r1] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <12 x i8>, <12 x i8>* %src, align 4 %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> @@ -550,41 +541,42 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) { ; CHECK-LABEL: vld3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrb.u16 q1, [r0, #16] -; CHECK-NEXT: vmov.u8 r2, q0[1] -; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u8 
r2, q0[10] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.u8 r2, q0[9] ; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.u8 r0, q0[13] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmov.f32 s18, s5 -; CHECK-NEXT: vmovx.f16 s11, s5 +; CHECK-NEXT: vmov.u8 r0, q1[0] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vins.f16 s18, s16 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vadd.i16 q2, q3, q2 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vmov.16 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[5] ; CHECK-NEXT: vmov.16 q3[1], r0 @@ -594,15 +586,14 @@ ; CHECK-NEXT: vmov.16 q3[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.f32 s15, s2 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vadd.i16 q0, q2, q3 ; CHECK-NEXT: vstrb.16 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i8>, <24 x i8>* %src, align 4 @@ -871,15 +862,14 @@ define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) { ; CHECK-LABEL: vld3_v2f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldr s1, [r0, #16] -; CHECK-NEXT: vldr s5, [r0, #20] -; CHECK-NEXT: vmov.f64 d6, d4 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vadd.f32 q0, q3, q0 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vadd.f32 q2, q1, q2 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vstmia r1, {s0, s1} ; CHECK-NEXT: bx lr entry: @@ -1102,32 +1092,28 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: ldrd r2, r3, [r0, #16] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s5, s3 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: 
vmovx.f16 s12, s2 +; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s8, s12 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vadd.f16 q1, q3, q1 ; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vins.f16 s4, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s13, s9 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 diff --git a/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll b/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll --- a/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll +++ b/llvm/test/CodeGen/X86/2012-01-11-split-cv.ll @@ -8,8 +8,8 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: vmovups (%ecx), %ymm0 ; CHECK-NEXT: movl 32(%ecx), %ecx -; CHECK-NEXT: movl %ecx, 32(%eax) ; CHECK-NEXT: vmovups %ymm0, (%eax) +; CHECK-NEXT: movl %ecx, 32(%eax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl $4 %b = load <18 x i16>, <18 x i16>* %bp, align 16 diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll --- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll +++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll @@ -130,14 +130,8 @@ define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) { ; SSE2-LABEL: dot3_float3: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: movups (%rdi), %xmm0 +; SSE2-NEXT: movups (%rsi), %xmm1 ; SSE2-NEXT: mulps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] @@ -148,14 +142,8 @@ ; ; SSSE3-LABEL: dot3_float3: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSSE3-NEXT: movups (%rdi), %xmm0 +; SSSE3-NEXT: movups (%rsi), %xmm1 ; SSSE3-NEXT: mulps %xmm0, %xmm1 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSSE3-NEXT: addss %xmm1, %xmm0 @@ -165,10 +153,8 @@ ; ; SSE41-LABEL: dot3_float3: ; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; 
SSE41-NEXT: movups (%rdi), %xmm0 +; SSE41-NEXT: movups (%rsi), %xmm1 ; SSE41-NEXT: mulps %xmm0, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE41-NEXT: addss %xmm1, %xmm0 @@ -178,11 +164,8 @@ ; ; AVX-LABEL: dot3_float3: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -693,7 +693,7 @@ ; SSE2-LABEL: interleave_24i8_out: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm2 @@ -752,7 +752,7 @@ ; SSE42-LABEL: interleave_24i8_out: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqu (%rdi), %xmm0 -; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqu 16(%rdi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[2,5,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movdqa %xmm0, %xmm3 @@ -774,7 +774,7 @@ ; AVX-LABEL: interleave_24i8_out: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovdqu 16(%rdi), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -791,11 +791,11 @@ ; ; XOP-LABEL: interleave_24i8_out: ; XOP: # %bb.0: -; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; XOP-NEXT: vmovdqu (%rdi), %xmm1 -; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u] +; XOP-NEXT: vmovdqu (%rdi), %xmm0 +; XOP-NEXT: vmovdqu 16(%rdi), %xmm1 +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,3,6,9,12,15],xmm1[2,5],xmm0[u,u,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[1,4,7,10,13],xmm1[0,3,6],xmm0[u,u,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2,5,8,11,14],xmm1[1,4,7],xmm0[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm2, (%rsi) ; XOP-NEXT: vmovq %xmm3, (%rdx) ; XOP-NEXT: vmovq %xmm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll --- a/llvm/test/CodeGen/X86/pr34657.ll +++ b/llvm/test/CodeGen/X86/pr34657.ll @@ -6,11 +6,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vmovups (%rsi), %zmm0 -; CHECK-NEXT: vmovups 64(%rsi), %ymm1 -; CHECK-NEXT: vmovups 96(%rsi), %xmm2 -; CHECK-NEXT: vmovaps %xmm2, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovups 64(%rsi), %zmm1 +; CHECK-NEXT: vextractf32x4 $2, %zmm1, 96(%rdi) ; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git 
a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll --- a/llvm/test/CodeGen/X86/pr46820.ll +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -11,15 +11,14 @@ ; CHECK-LABEL: load23: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups 64(%rsi), %ymm0 -; CHECK-NEXT: vmovups (%rsi), %zmm1 -; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss %xmm3, 88(%rdi) -; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) +; CHECK-NEXT: vmovups (%rsi), %zmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss %xmm1, 88(%rdi) +; CHECK-NEXT: movq 80(%rsi), %rcx +; CHECK-NEXT: movq %rcx, 80(%rdi) +; CHECK-NEXT: vmovaps 64(%rsi), %xmm1 +; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, <23 x float>* %p, align 16 @@ -33,11 +32,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vmovups (%rsi), %zmm0 +; CHECK-NEXT: vmovups 80(%rsi), %xmm1 +; CHECK-NEXT: vextractps $2, %xmm1, 88(%rdi) +; CHECK-NEXT: vmovups 80(%rsi), %xmm1 +; CHECK-NEXT: vmovlps %xmm1, 80(%rdi) ; CHECK-NEXT: vmovups 64(%rsi), %xmm1 -; CHECK-NEXT: movq 80(%rsi), %rcx -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss %xmm2, 88(%rdi) -; CHECK-NEXT: movq %rcx, 80(%rdi) ; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) ; CHECK-NEXT: vmovaps %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -377,11 +377,10 @@ ; X86-AVX512-LABEL: PR39483: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vmovups 0, %zmm0 -; X86-AVX512-NEXT: vmovups 64, %ymm1 -; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] -; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 +; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,8,11,14,17,20,23] +; X86-AVX512-NEXT: vpermi2ps 64, %zmm0, %zmm1 ; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 +; X86-AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm1 ; X86-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; X86-AVX512-NEXT: vmovups %ymm0, (%eax) ; @@ -420,11 +419,10 @@ ; X64-AVX512-LABEL: PR39483: ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vmovups 0, %zmm0 -; X64-AVX512-NEXT: vmovups 64, %ymm1 -; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] -; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,8,11,14,17,20,23] +; X64-AVX512-NEXT: vpermi2ps 64, %zmm0, %zmm1 ; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 +; X64-AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm1 ; X64-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; X64-AVX512-NEXT: vmovups %ymm0, (%rax) entry: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -33,17 +33,17 @@ ; ; AVX2-LABEL: foo: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu 32(%rdi), %xmm0 -; AVX2-NEXT: vmovdqu (%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] -; AVX2-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: foo: @@ -62,26 +62,25 @@ ; ; AVX512BW-LABEL: foo: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $63488, %eax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 +; AVX512BW-NEXT: vpermi2w %ymm1, %ymm2, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI-LABEL: foo: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm2 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,3,4,6,7,9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,33,34,36,37,39,40,42,43,45,46] -; AVX512VBMI-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 +; AVX512VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 ; AVX512VBMI-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %x0, align 1 %2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/widen_load-3.ll b/llvm/test/CodeGen/X86/widen_load-3.ll --- a/llvm/test/CodeGen/X86/widen_load-3.ll +++ b/llvm/test/CodeGen/X86/widen_load-3.ll @@ -89,11 +89,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-AVX-NEXT: vmovups 
48(%ecx), %xmm1 +; X86-AVX-NEXT: vextractps $1, %xmm1, 52(%eax) +; X86-AVX-NEXT: vmovss %xmm1, 48(%eax) ; X86-AVX-NEXT: vmovups 32(%ecx), %xmm1 -; X86-AVX-NEXT: movl 48(%ecx), %edx -; X86-AVX-NEXT: movl 52(%ecx), %ecx -; X86-AVX-NEXT: movl %ecx, 52(%eax) -; X86-AVX-NEXT: movl %edx, 48(%eax) ; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax) ; X86-AVX-NEXT: vmovaps %ymm0, (%eax) ; X86-AVX-NEXT: vzeroupper @@ -116,9 +115,9 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movq %rdi, %rax ; X64-AVX-NEXT: vmovups (%rsi), %ymm0 +; X64-AVX-NEXT: vmovups 48(%rsi), %xmm1 +; X64-AVX-NEXT: vmovlps %xmm1, 48(%rdi) ; X64-AVX-NEXT: vmovups 32(%rsi), %xmm1 -; X64-AVX-NEXT: movq 48(%rsi), %rcx -; X64-AVX-NEXT: movq %rcx, 48(%rdi) ; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi) ; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) ; X64-AVX-NEXT: vzeroupper
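
Illustrative note (not part of the patch or its test suite): the LegalizeVectorTypes.cpp change above lets GenWidenVectorLoad treat bytes proven dereferenceable through the load's pointer info the same way it previously treated alignment, so an undersized vector load can be widened into one wider load instead of being split into several narrower ones. A minimal IR sketch of the kind of input that benefits follows; the function name @load_v3f32 is hypothetical, and the codegen described is the expected outcome rather than a checked-in test.

; With the argument known dereferenceable(16), type legalization may widen the
; 12-byte <3 x float> load into a single 16-byte load (e.g. one movups on x86),
; rather than splitting it into an 8-byte load plus a 4-byte load.
define <3 x float> @load_v3f32(<3 x float>* dereferenceable(16) %p) {
  %v = load <3 x float>, <3 x float>* %p, align 4
  ret <3 x float> %v
}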