diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5144,17 +5144,19 @@
 // TLI: Target lowering used to determine legal types.
 // Width: Width left need to load/store.
 // WidenVT: The widen vector type to load to/store from
-// Align: If 0, don't allow use of a wider type
-// WidenEx: If Align is not 0, the amount additional we can load/store from.
+// NumDereferenceableBytes: If 0, don't allow use of a wider type
+// WidenEx: If NumDereferenceableBytes is not 0,
+// the additional amount we are allowed to load/store.
-static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+static EVT FindMemType(SelectionDAG &DAG, const TargetLowering &TLI,
                        unsigned Width, EVT WidenVT,
-                       unsigned Align = 0, unsigned WidenEx = 0) {
+                       unsigned NumDereferenceableBytes = 0,
+                       unsigned WidenEx = 0) {
   EVT WidenEltVT = WidenVT.getVectorElementType();
   const bool Scalable = WidenVT.isScalableVector();
   unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
-  unsigned AlignInBits = Align*8;
+  unsigned NumDereferenceableBits = NumDereferenceableBytes * 8;
 
   // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
@@ -5174,8 +5176,9 @@
          Action == TargetLowering::TypePromoteInteger) &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (MemVTWidth == WidenWidth)
         return MemVT;
       RetVT = MemVT;
@@ -5197,8 +5200,9 @@
         WidenEltVT == MemVT.getVectorElementType() &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT)
         return MemVT;
     }
@@ -5264,13 +5268,24 @@
   TypeSize LdWidth = LdVT.getSizeInBits();
   TypeSize WidenWidth = WidenVT.getSizeInBits();
   TypeSize WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads if they are sufficiently aligned to avoid memory faults
-  // and if the original load is simple.
-  unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
+  unsigned NumDereferenceableBytes = 0;
+  // Allow wider loads if the original load is simple and we can dereference
+  // padding bytes.
+  if (LD->isSimple()) {
+    NumDereferenceableBytes = LD->getAlignment();
+    if (!LdWidth.isScalable())
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, LdWidth / 8);
+    if (!WidenWidth.isScalable() && NumDereferenceableBytes < WidenWidth / 8 &&
+        LD->getPointerInfo().isDereferenceable(
+            WidenWidth / 8, *DAG.getContext(), DAG.getDataLayout()))
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, WidenWidth / 8);
+  }
 
   // Find the vector type that can load from.
-  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                          WidthDiff.getKnownMinSize());
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                          NumDereferenceableBytes, WidthDiff.getKnownMinSize());
   TypeSize NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                              LD->getOriginalAlign(), MMOFlags, AAInfo);
@@ -5306,13 +5321,29 @@
     MachinePointerInfo MPI = LD->getPointerInfo();
     do {
       LdWidth -= NewVTWidth;
+      if (LD->isSimple()) {
+        if (!NewVTWidth.isScalable()) {
+          if (NumDereferenceableBytes > NewVTWidth / 8)
+            NumDereferenceableBytes -= NewVTWidth / 8;
+          else
+            NumDereferenceableBytes = 0;
+          NumDereferenceableBytes = std::max(
+              NumDereferenceableBytes,
+              commonAlignment(cast<LoadSDNode>(LdOp)->getOriginalAlign(),
+                              cast<LoadSDNode>(LdOp)->getSrcValueOffset() +
+                                  NewVTWidth)
+                  .value());
+        } else
+          NumDereferenceableBytes = 0; // FIXME
+      }
+
       IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
                        &ScaledOffset);
 
      if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) {
        // The current type we are using is too large. Find a better size.
-        NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                            WidthDiff.getKnownMinSize());
+        NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                            NumDereferenceableBytes, WidthDiff.getKnownMinSize());
         NewVTWidth = NewVT.getSizeInBits();
       }
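The core of this change is replacing the "alignment" widening budget with a "number of dereferenceable bytes" budget. As a standalone illustration of the byte-level reasoning (the helper and its names below are invented for this note, not LLVM API, and the real code additionally guards scalable vectors, as the hunks above show):

  #include <algorithm>

  // Sketch: how many bytes past the load's base pointer are known safe
  // to access.
  //   IsSimple         - the load is neither volatile nor atomic
  //   AlignBytes       - alignment of the original load
  //   LdBytes          - size of the original load in bytes
  //   WidenBytes       - size of the widened vector type in bytes
  //   PtrDerefForWiden - the pointer is known dereferenceable for
  //                      WidenBytes bytes (e.g. a dereferenceable(N)
  //                      argument attribute)
  static unsigned numDereferenceableBytes(bool IsSimple, unsigned AlignBytes,
                                          unsigned LdBytes, unsigned WidenBytes,
                                          bool PtrDerefForWiden) {
    if (!IsSimple)
      return 0; // Never widen volatile/atomic loads.
    // An aligned access cannot fault within its alignment unit, and the
    // load itself proves its own LdBytes are dereferenceable.
    unsigned Bytes = std::max(AlignBytes, LdBytes);
    // Dereferenceability information can extend the budget to the full
    // widened width.
    if (PtrDerefForWiden)
      Bytes = std::max(Bytes, WidenBytes);
    return Bytes;
  }

FindMemType then accepts a memory type wider than the remaining width only when it fits both this budget (NumDereferenceableBits) and Width + WidenEx. The test updates below follow from that.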
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1696,35 +1696,33 @@
 define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
 ; SI-LABEL: v5i8_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xc
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:4
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:4
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v5i8_arg:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT:    s_load_dword s1, s[0:1], 0x30
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s2, 4
+; VI-NEXT:    s_add_u32 s4, s2, 4
+; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    s_addc_u32 s1, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_byte v[2:3], v4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v5, s0
+; VI-NEXT:    flat_store_byte v[2:3], v4
+; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5i8_arg:
@@ -2269,82 +2267,80 @@
 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
 ; SI-LABEL: v5i64_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
-; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
 ; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v5i64_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s8, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    s_addc_u32 s9, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_add_u32 s4, s16, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_addc_u32 s5, s17, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s16
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_u32 s0, s16, 32
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s17
+; VI-NEXT:    s_addc_u32 s1, s17, 0
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    s_nop 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5i64_arg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v5i64_arg:
@@ -2412,82 +2408,80 @@
 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
 ; SI-LABEL: v5f64_arg:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
-; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
 ; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v5f64_arg:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
+; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_add_u32 s8, s2, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    s_addc_u32 s9, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_add_u32 s4, s16, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_addc_u32 s5, s17, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s16
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_u32 s0, s16, 32
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v5, s17
+; VI-NEXT:    s_addc_u32 s1, s17, 0
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    s_nop 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v5f64_arg:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x40
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s12
-; GFX9-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-NEXT:    v_mov_b32_e32 v2, s14
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_mov_b32_e32 v3, s11
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17] offset:32
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v5f64_arg:
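The effect on the <5 x i64> and <5 x double> kernel arguments above: the 320-bit argument used to be loaded as a 256-bit chunk plus a 64-bit remainder, and is now fetched with a single 512-bit s_load_dwordx16, presumably because the kernel-argument region is known dereferenceable past the widened width. A rough, self-contained simulation of the chunking loop with those numbers (findMemTypeBits is a simplified stand-in invented for this note, not the real FindMemType):

  #include <cstdio>

  // Pick the widest power-of-two chunk: a candidate wider than the
  // remaining Width is only allowed when enough bits are dereferenceable.
  static unsigned findMemTypeBits(unsigned Width, unsigned WidenWidth,
                                  unsigned DerefBits) {
    for (unsigned W = WidenWidth; W > 64; W /= 2)
      if (W <= Width || (DerefBits != 0 && W <= DerefBits))
        return W;
    return 64; // fall back to a single i64 element
  }

  int main() {
    const unsigned WidenWidth = 512;   // <5 x i64> widens to v8i64
    const unsigned Cases[] = {0, 512}; // without/with dereferenceable bits
    for (unsigned DerefBits : Cases) {
      unsigned Width = 320; // bits of <5 x i64> left to load
      std::printf("deref=%u:", DerefBits);
      while (Width > 0) {
        unsigned W = findMemTypeBits(Width, WidenWidth, DerefBits);
        std::printf(" load%u", W);
        Width -= (W < Width) ? W : Width;
      }
      std::printf("\n");
    }
    // deref=0:   load256 load64  (old: s_load_dwordx8 + s_load_dwordx2)
    // deref=512: load512         (new: s_load_dwordx16)
    return 0;
  }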
diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
--- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll
+++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
@@ -130,14 +130,8 @@
 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 ; SSE2-LABEL: dot3_float3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT:    movups (%rdi), %xmm0
+; SSE2-NEXT:    movups (%rsi), %xmm1
 ; SSE2-NEXT:    mulps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
@@ -148,14 +142,8 @@
 ;
 ; SSSE3-LABEL: dot3_float3:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT:    movups (%rdi), %xmm0
+; SSSE3-NEXT:    movups (%rsi), %xmm1
 ; SSSE3-NEXT:    mulps %xmm0, %xmm1
 ; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT:    addss %xmm1, %xmm0
@@ -165,10 +153,8 @@
 ;
 ; SSE41-LABEL: dot3_float3:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE41-NEXT:    movups (%rdi), %xmm0
+; SSE41-NEXT:    movups (%rsi), %xmm1
 ; SSE41-NEXT:    mulps %xmm0, %xmm1
 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    addss %xmm1, %xmm0
@@ -178,11 +164,8 @@
 ;
 ; AVX-LABEL: dot3_float3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovups (%rdi), %xmm0
+; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
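For the x86 test the numbers work out as follows: each <3 x float> load covers 12 bytes, the widened v4f32 covers 16, and both arguments are declared dereferenceable(16), so the movsd/movss/insertps/shufps sequences collapse into single unaligned 16-byte loads (movups, or a folded vmulps memory operand for AVX). A self-contained check of that arithmetic (illustrative only; the 4-byte alignment is an assumption about the test's IR):

  #include <algorithm>
  #include <cassert>

  int main() {
    const unsigned AlignBytes = 4;  // assumed alignment of the <3 x float> load
    const unsigned LdBytes = 12;    // <3 x float>
    const unsigned WidenBytes = 16; // v4f32
    const bool PtrDeref16 = true;   // from the dereferenceable(16) attribute

    unsigned Deref = std::max(AlignBytes, LdBytes); // 12 bytes known safe
    if (PtrDeref16)
      Deref = std::max(Deref, WidenBytes); // 16 bytes known safe
    // The whole v4f32 is dereferenceable, so one 128-bit load suffices;
    // without the attribute only 12 bytes are known safe and the load
    // must be split, as before the patch.
    assert(Deref >= WidenBytes);
    return 0;
  }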