diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8257,6 +8257,43 @@ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, SDLoc(N), ShAmtTy)); + + // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. + // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. + // TODO - bigendian support once we have test coverage. + // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine? + if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() && + !DAG.getDataLayout().isBigEndian()) { + auto *LHS = dyn_cast<LoadSDNode>(N0); + auto *RHS = dyn_cast<LoadSDNode>(N1); + if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && + LHS->getAddressSpace() == RHS->getAddressSpace() && + (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) { + if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { + SDLoc DL(RHS); + uint64_t PtrOff = + IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8); + unsigned NewAlign = MinAlign(RHS->getAlignment(), PtrOff); + bool Fast = false; + if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + RHS->getAddressSpace(), NewAlign, + RHS->getMemOperand()->getFlags(), &Fast) && + Fast) { + SDValue NewPtr = + DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL); + AddToWorklist(NewPtr.getNode()); + SDValue Load = DAG.getLoad( + VT, DL, RHS->getChain(), NewPtr, + RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign, + RHS->getMemOperand()->getFlags(), RHS->getAAInfo()); + // Replace the old load's chain with the new load's chain. 
+ WorklistRemover DeadNodes(*this); + DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1)); + return Load; + } + } + } + } } // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2) diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -547,39 +547,16 @@ } define i16 @combine_fshl_load_i16(i16* %p) nounwind { -; X86-FAST-LABEL: combine_fshl_load_i16: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movzwl (%eax), %ecx -; X86-FAST-NEXT: movzwl 2(%eax), %eax -; X86-FAST-NEXT: shldw $8, %cx, %ax -; X86-FAST-NEXT: retl -; -; X86-SLOW-LABEL: combine_fshl_load_i16: -; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movzwl 2(%ecx), %eax -; X86-SLOW-NEXT: movzbl 1(%ecx), %ecx -; X86-SLOW-NEXT: shll $8, %eax -; X86-SLOW-NEXT: orl %ecx, %eax -; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; X86-SLOW-NEXT: retl -; -; X64-FAST-LABEL: combine_fshl_load_i16: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movzwl (%rdi), %ecx -; X64-FAST-NEXT: movzwl 2(%rdi), %eax -; X64-FAST-NEXT: shldw $8, %cx, %ax -; X64-FAST-NEXT: retq +; X86-LABEL: combine_fshl_load_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl 1(%eax), %eax +; X86-NEXT: retl ; -; X64-SLOW-LABEL: combine_fshl_load_i16: -; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movzwl 2(%rdi), %eax -; X64-SLOW-NEXT: movzbl 1(%rdi), %ecx -; X64-SLOW-NEXT: shll $8, %eax -; X64-SLOW-NEXT: orl %ecx, %eax -; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; X64-SLOW-NEXT: retq +; X64-LABEL: combine_fshl_load_i16: +; X64: # %bb.0: +; X64-NEXT: movzwl 1(%rdi), %eax +; X64-NEXT: retq %p0 = getelementptr i16, i16* %p, i32 0 %p1 = getelementptr i16, i16* %p, i32 1 %ld0 = load i16, i16 *%p0 @@ -589,31 +566,16 @@ } define i32 @combine_fshl_load_i32(i32* %p) nounwind { -; X86-FAST-LABEL: combine_fshl_load_i32: -; X86-FAST: # %bb.0: -; 
X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl 8(%eax), %ecx -; X86-FAST-NEXT: movl 12(%eax), %eax -; X86-FAST-NEXT: shldl $8, %ecx, %eax -; X86-FAST-NEXT: retl -; -; X86-SLOW-LABEL: combine_fshl_load_i32: -; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl 11(%eax), %eax -; X86-SLOW-NEXT: retl -; -; X64-FAST-LABEL: combine_fshl_load_i32: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movl 8(%rdi), %ecx -; X64-FAST-NEXT: movl 12(%rdi), %eax -; X64-FAST-NEXT: shldl $8, %ecx, %eax -; X64-FAST-NEXT: retq +; X86-LABEL: combine_fshl_load_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 11(%eax), %eax +; X86-NEXT: retl ; -; X64-SLOW-LABEL: combine_fshl_load_i32: -; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movl 11(%rdi), %eax -; X64-SLOW-NEXT: retq +; X64-LABEL: combine_fshl_load_i32: +; X64: # %bb.0: +; X64-NEXT: movl 11(%rdi), %eax +; X64-NEXT: retq %p0 = getelementptr i32, i32* %p, i32 2 %p1 = getelementptr i32, i32* %p, i32 3 %ld0 = load i32, i32 *%p0 @@ -652,21 +614,10 @@ ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: retl ; -; X64-FAST-LABEL: combine_fshl_load_i64: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movq 8(%rdi), %rcx -; X64-FAST-NEXT: movq 16(%rdi), %rax -; X64-FAST-NEXT: shldq $24, %rcx, %rax -; X64-FAST-NEXT: retq -; -; X64-SLOW-LABEL: combine_fshl_load_i64: -; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq 8(%rdi), %rcx -; X64-SLOW-NEXT: movq 16(%rdi), %rax -; X64-SLOW-NEXT: shrq $40, %rcx -; X64-SLOW-NEXT: shlq $24, %rax -; X64-SLOW-NEXT: orq %rcx, %rax -; X64-SLOW-NEXT: retq +; X64-LABEL: combine_fshl_load_i64: +; X64: # %bb.0: +; X64-NEXT: movq 13(%rdi), %rax +; X64-NEXT: retq %p0 = getelementptr i64, i64* %p, i64 1 %p1 = getelementptr i64, i64* %p, i64 2 %ld0 = load i64, i64 *%p0 diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -542,39 +542,16 @@ } define i16 
@combine_fshr_load_i16(i16* %p) nounwind { -; X86-FAST-LABEL: combine_fshr_load_i16: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movzwl (%eax), %ecx -; X86-FAST-NEXT: movzwl 2(%eax), %eax -; X86-FAST-NEXT: shldw $8, %cx, %ax -; X86-FAST-NEXT: retl -; -; X86-SLOW-LABEL: combine_fshr_load_i16: -; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movzwl 2(%ecx), %eax -; X86-SLOW-NEXT: movzbl 1(%ecx), %ecx -; X86-SLOW-NEXT: shll $8, %eax -; X86-SLOW-NEXT: orl %ecx, %eax -; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; X86-SLOW-NEXT: retl -; -; X64-FAST-LABEL: combine_fshr_load_i16: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movzwl (%rdi), %ecx -; X64-FAST-NEXT: movzwl 2(%rdi), %eax -; X64-FAST-NEXT: shldw $8, %cx, %ax -; X64-FAST-NEXT: retq +; X86-LABEL: combine_fshr_load_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl 1(%eax), %eax +; X86-NEXT: retl ; -; X64-SLOW-LABEL: combine_fshr_load_i16: -; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movzwl 2(%rdi), %eax -; X64-SLOW-NEXT: movzbl 1(%rdi), %ecx -; X64-SLOW-NEXT: shll $8, %eax -; X64-SLOW-NEXT: orl %ecx, %eax -; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; X64-SLOW-NEXT: retq +; X64-LABEL: combine_fshr_load_i16: +; X64: # %bb.0: +; X64-NEXT: movzwl 1(%rdi), %eax +; X64-NEXT: retq %p0 = getelementptr i16, i16* %p, i32 0 %p1 = getelementptr i16, i16* %p, i32 1 %ld0 = load i16, i16 *%p0 @@ -584,39 +561,16 @@ } define i32 @combine_fshr_load_i32(i32* %p) nounwind { -; X86-FAST-LABEL: combine_fshr_load_i32: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl 8(%eax), %ecx -; X86-FAST-NEXT: movl 12(%eax), %eax -; X86-FAST-NEXT: shldl $24, %ecx, %eax -; X86-FAST-NEXT: retl -; -; X86-SLOW-LABEL: combine_fshr_load_i32: -; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl 8(%eax), %ecx -; X86-SLOW-NEXT: movl 12(%eax), %eax -; 
X86-SLOW-NEXT: shrl $8, %ecx -; X86-SLOW-NEXT: shll $24, %eax -; X86-SLOW-NEXT: orl %ecx, %eax -; X86-SLOW-NEXT: retl -; -; X64-FAST-LABEL: combine_fshr_load_i32: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movl 8(%rdi), %ecx -; X64-FAST-NEXT: movl 12(%rdi), %eax -; X64-FAST-NEXT: shldl $24, %ecx, %eax -; X64-FAST-NEXT: retq +; X86-LABEL: combine_fshr_load_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 9(%eax), %eax +; X86-NEXT: retl ; -; X64-SLOW-LABEL: combine_fshr_load_i32: -; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movl 8(%rdi), %ecx -; X64-SLOW-NEXT: movl 12(%rdi), %eax -; X64-SLOW-NEXT: shrl $8, %ecx -; X64-SLOW-NEXT: shll $24, %eax -; X64-SLOW-NEXT: orl %ecx, %eax -; X64-SLOW-NEXT: retq +; X64-LABEL: combine_fshr_load_i32: +; X64: # %bb.0: +; X64-NEXT: movl 9(%rdi), %eax +; X64-NEXT: retq %p0 = getelementptr i32, i32* %p, i32 2 %p1 = getelementptr i32, i32* %p, i32 3 %ld0 = load i32, i32 *%p0 @@ -656,21 +610,10 @@ ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: retl ; -; X64-FAST-LABEL: combine_fshr_load_i64: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movq 8(%rdi), %rcx -; X64-FAST-NEXT: movq 16(%rdi), %rax -; X64-FAST-NEXT: shldq $40, %rcx, %rax -; X64-FAST-NEXT: retq -; -; X64-SLOW-LABEL: combine_fshr_load_i64: -; X64-SLOW: # %bb.0: -; X64-SLOW-NEXT: movq 8(%rdi), %rcx -; X64-SLOW-NEXT: movq 16(%rdi), %rax -; X64-SLOW-NEXT: shrq $24, %rcx -; X64-SLOW-NEXT: shlq $40, %rax -; X64-SLOW-NEXT: orq %rcx, %rax -; X64-SLOW-NEXT: retq +; X64-LABEL: combine_fshr_load_i64: +; X64: # %bb.0: +; X64-NEXT: movq 11(%rdi), %rax +; X64-NEXT: retq %p0 = getelementptr i64, i64* %p, i64 1 %p1 = getelementptr i64, i64* %p, i64 2 %ld0 = load i64, i64 *%p0