diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5145,17 +5145,19 @@
 // TLI: Target lowering used to determine legal types.
 // Width: Width left need to load/store.
 // WidenVT: The widen vector type to load to/store from
-// Align: If 0, don't allow use of a wider type
-// WidenEx: If Align is not 0, the amount additional we can load/store from.
+// NumDereferenceableBytes: If 0, don't allow use of a wider type
+// WidenEx: If NumDereferenceableBytes is not 0,
+//          the additional amount we can load/store.

-static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+static EVT FindMemType(SelectionDAG &DAG, const TargetLowering &TLI,
                        unsigned Width, EVT WidenVT,
-                       unsigned Align = 0, unsigned WidenEx = 0) {
+                       unsigned NumDereferenceableBytes = 0,
+                       unsigned WidenEx = 0) {
   EVT WidenEltVT = WidenVT.getVectorElementType();
   const bool Scalable = WidenVT.isScalableVector();
   unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
   unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
-  unsigned AlignInBits = Align*8;
+  unsigned NumDereferenceableBits = NumDereferenceableBytes * 8;

   // If we have one element to load/store, return it.
   EVT RetVT = WidenEltVT;
@@ -5178,8 +5180,9 @@
          Action == TargetLowering::TypePromoteInteger) &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (MemVTWidth == WidenWidth)
         return MemVT;
       RetVT = MemVT;
@@ -5203,8 +5206,9 @@
         WidenEltVT == MemVT.getVectorElementType() &&
         (WidenWidth % MemVTWidth) == 0 &&
         isPowerOf2_32(WidenWidth / MemVTWidth) &&
-        (MemVTWidth <= Width ||
-         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+        (MemVTWidth <= Width || (NumDereferenceableBytes != 0 &&
+                                 MemVTWidth <= NumDereferenceableBits &&
+                                 MemVTWidth <= Width + WidenEx))) {
       if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT)
         return MemVT;
     }
@@ -5270,13 +5274,24 @@
   TypeSize LdWidth = LdVT.getSizeInBits();
   TypeSize WidenWidth = WidenVT.getSizeInBits();
   TypeSize WidthDiff = WidenWidth - LdWidth;
-  // Allow wider loads if they are sufficiently aligned to avoid memory faults
-  // and if the original load is simple.
-  unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
+  unsigned NumDereferenceableBytes = 0;
+  // Allow wider loads if the original load is simple and we can dereference
+  // padding bytes.
+  if (LD->isSimple()) {
+    NumDereferenceableBytes = LD->getAlignment();
+    if (!LdWidth.isScalable())
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, LdWidth / 8);
+    if (!WidenWidth.isScalable() && NumDereferenceableBytes < WidenWidth / 8 &&
+        LD->getPointerInfo().isDereferenceable(
+            WidenWidth / 8, *DAG.getContext(), DAG.getDataLayout()))
+      NumDereferenceableBytes =
+          std::max(NumDereferenceableBytes, WidenWidth / 8);
+  }

   // Find the vector type that can load from.
-  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                          WidthDiff.getKnownMinSize());
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                          NumDereferenceableBytes, WidthDiff.getKnownMinSize());
   TypeSize NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                              LD->getOriginalAlign(), MMOFlags, AAInfo);
@@ -5312,13 +5327,27 @@
     MachinePointerInfo MPI = LD->getPointerInfo();
     do {
       LdWidth -= NewVTWidth;
+      if (!NewVTWidth.isScalable()) {
+        if (NumDereferenceableBytes > NewVTWidth / 8)
+          NumDereferenceableBytes -= NewVTWidth / 8;
+        else
+          NumDereferenceableBytes = 0;
+        NumDereferenceableBytes = std::max(
+            NumDereferenceableBytes,
+            commonAlignment(cast<LoadSDNode>(LdOp)->getOriginalAlign(),
+                            cast<LoadSDNode>(LdOp)->getSrcValueOffset() +
+                                NewVTWidth)
+                .value());
+      } else
+        NumDereferenceableBytes = 0; // FIXME
+
       IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
                        &ScaledOffset);

       if (TypeSize::isKnownLT(LdWidth, NewVTWidth)) {
         // The current type we are using is too large. Find a better size.
-        NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
-                            WidthDiff.getKnownMinSize());
+        NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT,
+                            NumDereferenceableBytes, WidthDiff.getKnownMinSize());
         NewVTWidth = NewVT.getSizeInBits();
       }
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -383,7 +383,7 @@
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46

-; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
@@ -454,12 +454,12 @@
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
-; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
-; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
-; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
 entry:
   store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
@@ -479,12 +479,12 @@
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
-; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
-; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
-; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
-; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
+; SI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x9
+; MESA-VI-DAG: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
+; HSA-GFX9-DAG: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
+; HSA-GFX9-DAG: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
 entry:
   store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
diff --git a/llvm/test/CodeGen/ARM/vector-load.ll b/llvm/test/CodeGen/ARM/vector-load.ll
--- a/llvm/test/CodeGen/ARM/vector-load.ll
+++ b/llvm/test/CodeGen/ARM/vector-load.ll
@@ -253,10 +253,8 @@
 }

 ; CHECK-LABEL: test_silly_load:
-; CHECK: vldr d{{[0-9]+}}, [r0, #16]
-; CHECK: movs r1, #24
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128], r1
-; CHECK: ldr {{r[0-9]+}}, [r0]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]
 define void @test_silly_load(<28 x i8>* %addr) {
 	load volatile <28 x i8>, <28 x i8>* %addr
diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
--- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll
+++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
@@ -130,14 +130,8 @@
 define float @dot3_float3(float* dereferenceable(16) %a0, float* dereferenceable(16) %a1) {
 ; SSE2-LABEL: dot3_float3:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: movups (%rdi), %xmm0
+; SSE2-NEXT: movups (%rsi), %xmm1
 ; SSE2-NEXT: mulps %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
@@ -148,14 +142,8 @@
 ;
 ; SSSE3-LABEL: dot3_float3:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: movups (%rdi), %xmm0
+; SSSE3-NEXT: movups (%rsi), %xmm1
 ; SSSE3-NEXT: mulps %xmm0, %xmm1
 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSSE3-NEXT: addss %xmm1, %xmm0
@@ -165,10 +153,8 @@
 ;
 ; SSE41-LABEL: dot3_float3:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE41-NEXT: movups (%rdi), %xmm0
+; SSE41-NEXT: movups (%rsi), %xmm1
 ; SSE41-NEXT: mulps %xmm0, %xmm1
 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -178,11 +164,8 @@
 ;
 ; AVX-LABEL: dot3_float3:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: vmulps (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
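
Illustrative example (not part of the patch; the function name is hypothetical): a minimal IR sketch of the pattern the new dereferenceability check enables, mirroring the dot3_float3 test above. A 12-byte <3 x float> load from a pointer known dereferenceable for 16 bytes may now be legalized as one 16-byte wide load (e.g. a single movups on x86) instead of being pieced together from narrower loads, because the 4 trailing bytes are known safe to read.

; Reduced sketch, in the style of load-partial-dot-product.ll.
define <3 x float> @load_v3f32(<3 x float>* dereferenceable(16) %p) {
  ; <3 x float> is type-legalized by widening to <4 x float>; with this change
  ; the legalizer may emit a single wide load, since %p is known dereferenceable
  ; for the full widened width (16 bytes).
  %v = load <3 x float>, <3 x float>* %p, align 4
  ret <3 x float> %v
}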