Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10157,6 +10157,18 @@
   return true;
 }
 
+
+static bool AllowableAlignment(const SelectionDAG& DAG,
+                               const TargetLowering& TLI,
+                               EVT EVTTy, unsigned AS, unsigned Align) {
+  if (TLI.allowsMisalignedMemoryAccesses(EVTTy, AS, Align))
+    return true;
+
+  Type *Ty = EVTTy.getTypeForEVT(*DAG.getContext());
+  unsigned ABIAlignment = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
+  return (Align >= ABIAlignment);
+}
+
 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
   if (OptLevel == CodeGenOpt::None)
     return false;
@@ -10223,10 +10235,6 @@
     if (!Ptr.equalBaseIndex(BasePtr))
       break;
 
-    // Check that the alignment is the same.
-    if (Index->getAlignment() != St->getAlignment())
-      break;
-
     // The memory operands must not be volatile.
     if (Index->isVolatile() || Index->isIndexed())
       break;
@@ -10240,11 +10248,6 @@
     if (Index->getMemoryVT() != MemVT)
       break;
 
-    // We do not allow unaligned stores because we want to prevent overriding
-    // stores.
-    if (Index->getAlignment()*8 != MemVT.getSizeInBits())
-      break;
-
     // We found a potential memory operand to merge.
     StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
 
@@ -10318,6 +10321,8 @@
 
   // The node with the lowest store address.
   LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+  unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+  unsigned FirstStoreAlign = FirstInChain->getAlignment();
 
   // Store the constants into memory as one consecutive store.
   if (IsConstantSrc) {
@@ -10340,21 +10345,29 @@
       // Find a legal type for the constant store.
       unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
       EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
-      if (TLI.isTypeLegal(StoreTy))
+      if (TLI.isTypeLegal(StoreTy) &&
+          AllowableAlignment(DAG, TLI, StoreTy,
+                             FirstStoreAS, FirstStoreAlign)) {
        LastLegalType = i+1;
       // Or check whether a truncstore is legal.
-      else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+      } else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
               TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValueTy =
          TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
-        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy))
-          LastLegalType = i+1;
+        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+            AllowableAlignment(DAG, TLI, LegalizedStoredValueTy,
+                               FirstStoreAS, FirstStoreAlign)) {
+          LastLegalType = i+1;
+        }
       }
 
       // Find a legal type for the vector store.
       EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
-      if (TLI.isTypeLegal(Ty))
+      if (TLI.isTypeLegal(Ty) &&
+          AllowableAlignment(DAG, TLI, Ty,
+                             FirstStoreAS, FirstStoreAlign)) {
        LastLegalVectorType = i + 1;
+      }
     }
 
     // We only use vectors if the constant is known to be zero and the
@@ -10390,7 +10403,9 @@
 
       // Find a legal type for the vector store.
       EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
-      if (TLI.isTypeLegal(Ty))
+      if (TLI.isTypeLegal(Ty) &&
+          AllowableAlignment(DAG, TLI, Ty,
+                             FirstStoreAS, FirstStoreAlign))
        NumElem = i + 1;
     }
 
@@ -10417,10 +10432,6 @@
     if (!Ld->hasNUsesOfValue(1, 0))
       break;
 
-    // Check that the alignment is the same as the stores.
-    if (Ld->getAlignment() != St->getAlignment())
-      break;
-
     // The memory operands must not be volatile.
     if (Ld->isVolatile() || Ld->isIndexed())
       break;
 
@@ -10458,6 +10469,10 @@
       St->getAlignment() >= RequiredAlignment)
     return false;
 
+  LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+  unsigned FirstLoadAS = FirstLoad->getAddressSpace();
+  unsigned FirstLoadAlign = FirstLoad->getAlignment();
+
   // Scan the memory operations on the chain and find the first non-consecutive
   // load memory address. These variables hold the index in the store node
   // array.
@@ -10466,7 +10481,7 @@
   unsigned LastLegalVectorType = 0;
   unsigned LastLegalIntegerType = 0;
   StartAddress = LoadNodes[0].OffsetFromBase;
-  SDValue FirstChain = LoadNodes[0].MemNode->getChain();
+  SDValue FirstChain = FirstLoad->getChain();
   for (unsigned i = 1; i < LoadNodes.size(); ++i) {
     // All loads much share the same chain.
     if (LoadNodes[i].MemNode->getChain() != FirstChain)
       break;
@@ -10479,13 +10494,22 @@
 
     // Find a legal type for the vector store.
     EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
-    if (TLI.isTypeLegal(StoreTy))
+    if (TLI.isTypeLegal(StoreTy) &&
+        AllowableAlignment(DAG, TLI, StoreTy,
+                           FirstStoreAS, FirstStoreAlign) &&
+        AllowableAlignment(DAG, TLI, StoreTy,
+                           FirstLoadAS, FirstLoadAlign)) {
      LastLegalVectorType = i + 1;
+    }
 
     // Find a legal type for the integer store.
     unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
     StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
-    if (TLI.isTypeLegal(StoreTy))
+    if (TLI.isTypeLegal(StoreTy) &&
+        AllowableAlignment(DAG, TLI, StoreTy,
+                           FirstStoreAS, FirstStoreAlign) &&
+        AllowableAlignment(DAG, TLI, StoreTy,
+                           FirstLoadAS, FirstLoadAlign))
      LastLegalIntegerType = i + 1;
     // Or check whether a truncstore and extload is legal.
     else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
@@ -10495,7 +10519,11 @@
       if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
           TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
           TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy))
+          TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+          AllowableAlignment(DAG, TLI, LegalizedStoredValueTy,
+                             FirstStoreAS, FirstStoreAlign) &&
+          AllowableAlignment(DAG, TLI, LegalizedStoredValueTy,
+                             FirstLoadAS, FirstLoadAlign))
         LastLegalIntegerType = i+1;
     }
   }
@@ -10538,18 +10566,17 @@
     SDLoc LoadDL(LoadNodes[0].MemNode);
     SDLoc StoreDL(StoreNodes[0].MemNode);
 
-    LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
     SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
                                   FirstLoad->getChain(),
                                   FirstLoad->getBasePtr(),
                                   FirstLoad->getPointerInfo(),
                                   false, false, false,
-                                  FirstLoad->getAlignment());
+                                  FirstLoadAlign);
 
     SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
                                     FirstInChain->getBasePtr(),
                                     FirstInChain->getPointerInfo(),
                                     false, false,
-                                    FirstInChain->getAlignment());
+                                    FirstStoreAlign);
 
     // Replace one of the loads with the new load.
     LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[0].MemNode);
Index: test/CodeGen/PowerPC/MergeConsecutiveStores.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/MergeConsecutiveStores.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=ppc32 -mattr=+altivec < %s | FileCheck %s
+
+;; This test ensures that MergeConsecutiveStores does not attempt to
+;; merge stores or loads when doing so would result in unaligned
+;; memory operations (unless the target supports those, e.g. X86).
+
+;; This issue happens in other situations for other targets, but PPC
+;; with AltiVec extensions was chosen for the test because it does not
+;; support unaligned access with AltiVec instructions. If the 4
+;; loads/stores get merged to a v4i32 vector type, severely bad code
+;; gets generated: it painstakingly copies the values to a temporary
+;; location on the stack, with vector ops, in order to then use
+;; integer ops to load from the temporary stack location and store to
+;; the final location. Yuck!
+
+%struct.X = type { i32, i32, i32, i32 }
+
+@fx = common global %struct.X zeroinitializer, align 4
+@fy = common global %struct.X zeroinitializer, align 4
+
+;; In this test case, lvx and stvx instructions should NOT be
+;; generated, as the 4-byte alignment is not sufficient for the
+;; merge to be worthwhile.
+
+;; CHECK-LABEL: f:
+;; CHECK: lwzu
+;; CHECK-NEXT: lwz
+;; CHECK-NEXT: lwz
+;; CHECK-NEXT: lwz
+;; CHECK-NEXT: stwu
+;; CHECK-NEXT: stw
+;; CHECK-NEXT: stw
+;; CHECK-NEXT: stw
+;; CHECK-NEXT: blr
+define void @f() {
+entry:
+  %0 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 0), align 4
+  %1 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 1), align 4
+  %2 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 2), align 4
+  %3 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @fx, i32 0, i32 3), align 4
+  store i32 %0, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 0), align 4
+  store i32 %1, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 1), align 4
+  store i32 %2, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 2), align 4
+  store i32 %3, i32* getelementptr inbounds (%struct.X, %struct.X* @fy, i32 0, i32 3), align 4
+  ret void
+}
+
+@gx = common global %struct.X zeroinitializer, align 16
+@gy = common global %struct.X zeroinitializer, align 16
+
+;; In this test, lvx and stvx instructions SHOULD be generated, as
+;; the 16-byte alignment of the new load/store is acceptable.
+;; CHECK-LABEL: g:
+;; CHECK: lvx
+;; CHECK: stvx
+;; CHECK: blr
+define void @g() {
+entry:
+  %0 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @gx, i32 0, i32 0), align 16
+  %1 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @gx, i32 0, i32 1), align 4
+  %2 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @gx, i32 0, i32 2), align 4
+  %3 = load i32, i32* getelementptr inbounds (%struct.X, %struct.X* @gx, i32 0, i32 3), align 4
+  store i32 %0, i32* getelementptr inbounds (%struct.X, %struct.X* @gy, i32 0, i32 0), align 16
+  store i32 %1, i32* getelementptr inbounds (%struct.X, %struct.X* @gy, i32 0, i32 1), align 4
+  store i32 %2, i32* getelementptr inbounds (%struct.X, %struct.X* @gy, i32 0, i32 2), align 4
+  store i32 %3, i32* getelementptr inbounds (%struct.X, %struct.X* @gy, i32 0, i32 3), align 4
+  ret void
+}
Index: test/CodeGen/X86/MergeConsecutiveStores.ll
===================================================================
--- test/CodeGen/X86/MergeConsecutiveStores.ll
+++ test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -291,17 +291,12 @@
   ret void
 }
 
+;; On x86, even unaligned copies can be merged to vector ops.
 ; CHECK-LABEL: merge_loads_no_align:
 ; load:
-; CHECK: movl
-; CHECK: movl
-; CHECK: movl
-; CHECK: movl
+; CHECK: vmovups
 ; store:
-; CHECK: movl
-; CHECK: movl
-; CHECK: movl
-; CHECK: movl
+; CHECK: vmovups
 ; CHECK: ret
 define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
   %a1 = icmp sgt i32 %count, 0
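
For reference, the decision AllowableAlignment encodes is: allow the merged access if the
target tolerates misaligned accesses of that type in that address space, otherwise require
the alignment already known for the first store (or load) to be at least the type's preferred
alignment. Below is a minimal standalone C++ sketch of that decision; misalignedAccessOK and
preferredAlignment are hypothetical stand-ins for the TargetLowering and DataLayout queries,
not the real LLVM APIs.

  // Illustrative model of the alignment check added by this patch.
  #include <cstdio>
  #include <functional>

  static bool allowableAlignment(
      unsigned TypeWidthBytes, unsigned AddrSpace, unsigned Align,
      const std::function<bool(unsigned, unsigned, unsigned)> &misalignedAccessOK,
      const std::function<unsigned(unsigned)> &preferredAlignment) {
    // If the target is fine with a misaligned access of this type, merging is allowed.
    if (misalignedAccessOK(TypeWidthBytes, AddrSpace, Align))
      return true;
    // Otherwise only merge when the known alignment meets the preferred alignment.
    return Align >= preferredAlignment(TypeWidthBytes);
  }

  int main() {
    // AltiVec-like model: no misaligned vector accesses, and a 16-byte type
    // prefers 16-byte alignment.
    auto noMisaligned = [](unsigned, unsigned, unsigned) { return false; };
    auto prefAlign = [](unsigned Width) { return Width; };

    // Merging 4 x i32 into one 16-byte access, as in the two PPC test functions:
    std::printf("align 4:  %s\n",
                allowableAlignment(16, 0, 4, noMisaligned, prefAlign)
                    ? "merge" : "keep scalar ops");  // prints "keep scalar ops"
    std::printf("align 16: %s\n",
                allowableAlignment(16, 0, 16, noMisaligned, prefAlign)
                    ? "merge" : "keep scalar ops");  // prints "merge"
    return 0;
  }

Under this model, the align-4 case in @f stays as four scalar loads/stores while the
align-16 case in @g is merged, which is exactly what the two PPC CHECK blocks expect.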