Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11045,9 +11045,10 @@
       // Find a legal type for the constant store.
       unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
       EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+      bool IsFast;
       if (TLI.isTypeLegal(StoreTy) &&
           TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                 FirstStoreAlign)) {
+                                 FirstStoreAlign, &IsFast) && IsFast) {
         LastLegalType = i+1;
       // Or check whether a truncstore is legal.
       } else if (TLI.getTypeAction(Context, StoreTy) ==
@@ -11056,7 +11057,8 @@
             TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
         if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
             TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
-                                   FirstStoreAS, FirstStoreAlign)) {
+                                   FirstStoreAS, FirstStoreAlign, &IsFast) &&
+            IsFast) {
           LastLegalType = i + 1;
         }
       }
@@ -11071,7 +11073,7 @@
         EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
         if (TLI.isTypeLegal(Ty) &&
             TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
-                                   FirstStoreAlign))
+                                   FirstStoreAlign, &IsFast) && IsFast)
           LastLegalVectorType = i + 1;
       }
     }
@@ -11104,9 +11106,10 @@
 
       // Find a legal type for the vector store.
       EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
+      bool IsFast;
       if (TLI.isTypeLegal(Ty) &&
           TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
-                                 FirstStoreAlign))
+                                 FirstStoreAlign, &IsFast) && IsFast)
         NumElem = i + 1;
     }
 
@@ -11192,14 +11195,14 @@
     if (CurrAddress - StartAddress != (ElementSizeBytes * i))
       break;
     LastConsecutiveLoad = i;
-
     // Find a legal type for the vector store.
     EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1);
+    bool IsFastSt, IsFastLd;
     if (TLI.isTypeLegal(StoreTy) &&
         TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                               FirstStoreAlign) &&
+                               FirstStoreAlign, &IsFastSt) && IsFastSt &&
         TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
-                               FirstLoadAlign)) {
+                               FirstLoadAlign, &IsFastLd) && IsFastLd) {
       LastLegalVectorType = i + 1;
     }
 
@@ -11208,9 +11211,9 @@
     StoreTy = EVT::getIntegerVT(Context, SizeInBits);
     if (TLI.isTypeLegal(StoreTy) &&
         TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                               FirstStoreAlign) &&
+                               FirstStoreAlign, &IsFastSt) && IsFastSt &&
         TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
-                               FirstLoadAlign))
+                               FirstLoadAlign, &IsFastLd) && IsFastLd)
       LastLegalIntegerType = i + 1;
     // Or check whether a truncstore and extload is legal.
     else if (TLI.getTypeAction(Context, StoreTy) ==
@@ -11222,9 +11225,11 @@
         TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
         TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
         TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
-                               FirstStoreAS, FirstStoreAlign) &&
+                               FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+        IsFastSt &&
         TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
-                               FirstLoadAS, FirstLoadAlign))
+                               FirstLoadAS, FirstLoadAlign, &IsFastLd) &&
+        IsFastLd)
       LastLegalIntegerType = i+1;
   }
 }
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -409,7 +409,10 @@
     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
     // with adjacent offsets.
-    return Align % 4 == 0;
+    bool AlignedBy4 = (Align % 4 == 0);
+    if (IsFast)
+      *IsFast = AlignedBy4;
+    return AlignedBy4;
   }
 
   // Smaller than dword value must be aligned.
Index: llvm/trunk/test/CodeGen/X86/dag-merge-fast-accesses.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/dag-merge-fast-accesses.ll
+++ llvm/trunk/test/CodeGen/X86/dag-merge-fast-accesses.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-slow-unaligned-mem-16 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+slow-unaligned-mem-16 | FileCheck %s --check-prefix=SLOW
+
+; Verify that the DAGCombiner is creating unaligned 16-byte loads and stores
+; if and only if those are fast.
+
+define void @merge_const_vec_store(i64* %ptr) {
+; FAST-LABEL: merge_const_vec_store:
+; FAST:       # BB#0:
+; FAST-NEXT:    xorps %xmm0, %xmm0
+; FAST-NEXT:    movups %xmm0, (%rdi)
+; FAST-NEXT:    retq
+;
+; SLOW-LABEL: merge_const_vec_store:
+; SLOW:       # BB#0:
+; SLOW-NEXT:    movq $0, (%rdi)
+; SLOW-NEXT:    movq $0, 8(%rdi)
+; SLOW-NEXT:    retq
+
+  %idx0 = getelementptr i64, i64* %ptr, i64 0
+  %idx1 = getelementptr i64, i64* %ptr, i64 1
+
+  store i64 0, i64* %idx0, align 8
+  store i64 0, i64* %idx1, align 8
+  ret void
+}
+
+
+define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {
+; FAST-LABEL: merge_vec_element_store:
+; FAST:       # BB#0:
+; FAST-NEXT:    movups %xmm0, (%rdi)
+; FAST-NEXT:    retq
+;
+; SLOW-LABEL: merge_vec_element_store:
+; SLOW:       # BB#0:
+; SLOW-NEXT:    movlpd %xmm0, (%rdi)
+; SLOW-NEXT:    movhpd %xmm0, 8(%rdi)
+; SLOW-NEXT:    retq
+
+  %vecext0 = extractelement <4 x double> %v, i32 0
+  %vecext1 = extractelement <4 x double> %v, i32 1
+
+  %idx0 = getelementptr double, double* %ptr, i64 0
+  %idx1 = getelementptr double, double* %ptr, i64 1
+
+  store double %vecext0, double* %idx0, align 8
+  store double %vecext1, double* %idx1, align 8
+  ret void
+}
+
+
+define void @merge_vec_load_and_stores(i64 *%ptr) {
+; FAST-LABEL: merge_vec_load_and_stores:
+; FAST:       # BB#0:
+; FAST-NEXT:    movups (%rdi), %xmm0
+; FAST-NEXT:    movups %xmm0, 40(%rdi)
+; FAST-NEXT:    retq
+;
+; SLOW-LABEL: merge_vec_load_and_stores:
+; SLOW:       # BB#0:
+; SLOW-NEXT:    movq (%rdi), %rax
+; SLOW-NEXT:    movq 8(%rdi), %rcx
+; SLOW-NEXT:    movq %rax, 40(%rdi)
+; SLOW-NEXT:    movq %rcx, 48(%rdi)
+; SLOW-NEXT:    retq
+
+  %idx0 = getelementptr i64, i64* %ptr, i64 0
+  %idx1 = getelementptr i64, i64* %ptr, i64 1
+
+  %ld0 = load i64, i64* %idx0, align 4
+  %ld1 = load i64, i64* %idx1, align 4
+
+  %idx4 = getelementptr i64, i64* %ptr, i64 5
+  %idx5 = getelementptr i64, i64* %ptr, i64 6
+
+  store i64 %ld0, i64* %idx4, align 4
+  store i64 %ld1, i64* %idx5, align 4
+  ret void
+}
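
A note for readers skimming the patch: TLI.allowsMemoryAccess now answers two separate questions, whether a wide access is legal at all (the return value) and whether it is fast (the optional bool* out-parameter), and DAGCombiner merges only when both hold. The sketch below is a minimal, self-contained C++ model of that two-result idiom; allowsAccess and its alignment policy are invented for illustration and are not LLVM's actual API.

#include <cstdio>

// Hypothetical stand-in for TargetLowering::allowsMemoryAccess(): returns
// whether an access of SizeInBytes at alignment Align is legal at all, and,
// when IsFast is non-null, also reports whether it would be fast.
static bool allowsAccess(unsigned SizeInBytes, unsigned Align, bool *IsFast) {
  // Made-up policy for illustration: 4-byte alignment is enough for the
  // access to be legal, but it only counts as fast when naturally aligned.
  // (Compare the SIISelLowering hunk above, where legality and speed
  // happen to coincide.)
  bool AlignedBy4 = (Align % 4 == 0);
  if (IsFast)
    *IsFast = (Align % SizeInBytes == 0);
  return AlignedBy4;
}

int main() {
  // Caller-side pattern added to DAGCombiner: merge only when the wide
  // access is both legal *and* fast.
  bool IsFast;
  if (allowsAccess(/*SizeInBytes=*/16, /*Align=*/4, &IsFast) && IsFast)
    std::puts("merge into one 16-byte access");
  else
    std::puts("keep the narrow accesses"); // printed here: legal but slow
  return 0;
}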