diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16541,14 +16541,27 @@ } while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) { - // If we have load/store pair instructions and we only have two values, - // don't bother merging. Align RequiredAlignment; - if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && - StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) { - StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); - LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2); - break; + bool NeedRotate = false; + if (LoadNodes.size() == 2) { + // If we have load/store pair instructions and we only have two values, + // don't bother merging. + if (TLI.hasPairedLoad(MemVT, RequiredAlignment) && + StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) { + StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); + LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2); + break; + } + // If the loads are reversed, see if we can rotate the halves into place. + int64_t Offset0 = LoadNodes[0].OffsetFromBase; + int64_t Offset1 = LoadNodes[1].OffsetFromBase; + EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2); + if (Offset0 - Offset1 == ElementSizeBytes && + (hasOperation(ISD::ROTL, PairVT) || + hasOperation(ISD::ROTR, PairVT))) { + std::swap(LoadNodes[0], LoadNodes[1]); + NeedRotate = true; + } } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); @@ -16713,8 +16726,18 @@ NewLoad = DAG.getLoad( JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags); + SDValue StoreOp = NewLoad; + if (NeedRotate) { + unsigned LoadWidth = ElementSizeBytes * 8 * 2; + assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) && + "Unexpected type for rotate-able load pair"); + SDValue RotAmt = + DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL); + // Target can convert to the identical ROTR if it does not have ROTL. + StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt); + } NewStore = DAG.getStore( - NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), + NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); } else { // This must be the truncstore/extload case EVT ExtendedTy = diff --git a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll --- a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll +++ b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll @@ -95,6 +95,8 @@ ret void } +; TODO: rev16? + define void @rotate16_in_place(i8* %p) { ; A53-LABEL: rotate16_in_place: ; A53: // %bb.0: @@ -112,6 +114,8 @@ ret void } +; TODO: rev16? + define void @rotate16(i8* %p, i8* %q) { ; A53-LABEL: rotate16: ; A53: // %bb.0: @@ -134,10 +138,9 @@ define void @rotate32_in_place(i16* %p) { ; A53-LABEL: rotate32_in_place: ; A53: // %bb.0: -; A53-NEXT: ldrh w8, [x0, #2] -; A53-NEXT: ldrh w9, [x0] -; A53-NEXT: strh w8, [x0] -; A53-NEXT: strh w9, [x0, #2] +; A53-NEXT: ldr w8, [x0] +; A53-NEXT: ror w8, w8, #16 +; A53-NEXT: str w8, [x0] ; A53-NEXT: ret %p0 = getelementptr i16, i16* %p, i64 0 %p1 = getelementptr i16, i16* %p, i64 1 @@ -151,10 +154,9 @@ define void @rotate32(i16* %p) { ; A53-LABEL: rotate32: ; A53: // %bb.0: -; A53-NEXT: ldrh w8, [x0, #2] -; A53-NEXT: ldrh w9, [x0] -; A53-NEXT: strh w8, [x0, #84] -; A53-NEXT: strh w9, [x0, #86] +; A53-NEXT: ldr w8, [x0] +; A53-NEXT: ror w8, w8, #16 +; A53-NEXT: str w8, [x0, #84] ; A53-NEXT: ret %p0 = getelementptr i16, i16* %p, i64 0 %p1 = getelementptr i16, i16* %p, i64 1 @@ -167,6 +169,8 @@ ret void } +; Prefer paired memops over rotate. + define void @rotate64_in_place(i32* %p) { ; A53-LABEL: rotate64_in_place: ; A53: // %bb.0: @@ -182,6 +186,8 @@ ret void } +; Prefer paired memops over rotate. + define void @rotate64(i32* %p) { ; A53-LABEL: rotate64: ; A53: // %bb.0: diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll --- a/llvm/test/CodeGen/X86/stores-merging.ll +++ b/llvm/test/CodeGen/X86/stores-merging.ll @@ -246,10 +246,7 @@ define void @rotate16_in_place(i8* %p) { ; CHECK-LABEL: rotate16_in_place: ; CHECK: # %bb.0: -; CHECK-NEXT: movb (%rdi), %al -; CHECK-NEXT: movb 1(%rdi), %cl -; CHECK-NEXT: movb %cl, (%rdi) -; CHECK-NEXT: movb %al, 1(%rdi) +; CHECK-NEXT: rolw $8, (%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i8, i8* %p, i64 0 %p1 = getelementptr i8, i8* %p, i64 1 @@ -263,10 +260,9 @@ define void @rotate16(i8* %p, i8* %q) { ; CHECK-LABEL: rotate16: ; CHECK: # %bb.0: -; CHECK-NEXT: movb (%rdi), %al -; CHECK-NEXT: movb 1(%rdi), %cl -; CHECK-NEXT: movb %cl, (%rsi) -; CHECK-NEXT: movb %al, 1(%rsi) +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: rolw $8, %ax +; CHECK-NEXT: movw %ax, (%rsi) ; CHECK-NEXT: retq %p0 = getelementptr i8, i8* %p, i64 0 %p1 = getelementptr i8, i8* %p, i64 1 @@ -282,10 +278,7 @@ define void @rotate32_in_place(i16* %p) { ; CHECK-LABEL: rotate32_in_place: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: movw %ax, 2(%rdi) +; CHECK-NEXT: roll $16, (%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i16, i16* %p, i64 0 %p1 = getelementptr i16, i16* %p, i64 1 @@ -299,10 +292,9 @@ define void @rotate32(i16* %p) { ; CHECK-LABEL: rotate32: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movw %cx, 84(%rdi) -; CHECK-NEXT: movw %ax, 86(%rdi) +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: roll $16, %eax +; CHECK-NEXT: movl %eax, 84(%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i16, i16* %p, i64 0 %p1 = getelementptr i16, i16* %p, i64 1 @@ -318,10 +310,7 @@ define void @rotate64_in_place(i32* %p) { ; CHECK-LABEL: rotate64_in_place: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: movl 4(%rdi), %ecx -; CHECK-NEXT: movl %ecx, (%rdi) -; CHECK-NEXT: movl %eax, 4(%rdi) +; CHECK-NEXT: rolq $32, (%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i32, i32* %p, i64 0 %p1 = getelementptr i32, i32* %p, i64 1 @@ -335,10 +324,9 @@ define void @rotate64(i32* %p) { ; CHECK-LABEL: rotate64: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: movl 4(%rdi), %ecx -; CHECK-NEXT: movl %ecx, 8(%rdi) -; CHECK-NEXT: movl %eax, 12(%rdi) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: rolq $32, %rax +; CHECK-NEXT: movq %rax, 8(%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i32, i32* %p, i64 0 %p1 = getelementptr i32, i32* %p, i64 1 @@ -354,10 +342,9 @@ define void @rotate64_iterate(i16* %p) { ; CHECK-LABEL: rotate64_iterate: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: movl 4(%rdi), %ecx -; CHECK-NEXT: movl %ecx, 84(%rdi) -; CHECK-NEXT: movl %eax, 88(%rdi) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: rolq $32, %rax +; CHECK-NEXT: movq %rax, 84(%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i16, i16* %p, i64 0 %p1 = getelementptr i16, i16* %p, i64 1 @@ -378,6 +365,8 @@ ret void } +; TODO: recognize this as 2 rotates? + define void @rotate32_consecutive(i16* %p) { ; CHECK-LABEL: rotate32_consecutive: ; CHECK: # %bb.0: @@ -409,17 +398,17 @@ ret void } +; Same as above, but now the stores are not all consecutive. + define void @rotate32_twice(i16* %p) { ; CHECK-LABEL: rotate32_twice: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movzwl 4(%rdi), %edx -; CHECK-NEXT: movzwl 6(%rdi), %esi -; CHECK-NEXT: movw %cx, 84(%rdi) -; CHECK-NEXT: movw %ax, 86(%rdi) -; CHECK-NEXT: movw %si, 108(%rdi) -; CHECK-NEXT: movw %dx, 110(%rdi) +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: movl 4(%rdi), %ecx +; CHECK-NEXT: roll $16, %eax +; CHECK-NEXT: roll $16, %ecx +; CHECK-NEXT: movl %eax, 84(%rdi) +; CHECK-NEXT: movl %ecx, 108(%rdi) ; CHECK-NEXT: retq %p0 = getelementptr i16, i16* %p, i64 0 %p1 = getelementptr i16, i16* %p, i64 1