Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6553,11 +6553,27 @@
     return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
   }
 
-  // Handle X+C
-  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
-      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
-    return true;
-
+  // Handle X + C.
+  if (isBaseWithConstantOffset(Loc)) {
+    // getSExtValue() returns a 64-bit value; keep the full width so that a
+    // large offset is not truncated before the comparison.
+    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+    if (Loc.getOperand(0) == BaseLoc) {
+      // If the base location is a simple address with no offset itself, then
+      // the second load's first add operand should be the base address.
+      if (LocOffset == Dist * (signed)Bytes)
+        return true;
+    } else if (isBaseWithConstantOffset(BaseLoc)) {
+      // The base location itself has an offset, so subtract that value from the
+      // second load's offset before comparing to distance * size.
+      int64_t BOffset =
+          cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+        if ((LocOffset - BOffset) == Dist * (signed)Bytes)
+          return true;
+      }
+    }
+  }
   const GlobalValue *GV1 = nullptr;
   const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
Index: test/CodeGen/X86/chain_order.ll
===================================================================
--- test/CodeGen/X86/chain_order.ll
+++ test/CodeGen/X86/chain_order.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s
 
-;CHECK-LABEL: cftx020:
-;CHECK: vmovsd (%rdi), %xmm{{.*}}
-;CHECK: vmovsd 16(%rdi), %xmm{{.*}}
-;CHECK: vmovsd 24(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd 8(%rdi), %xmm{{.*}}
-;CHECK: vmovupd %xmm{{.*}}, (%rdi)
-;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
-;CHECK: ret
+; CHECK-LABEL: cftx020:
+; CHECK: vmovsd (%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 24(%rdi), %xmm{{.*}}
+; CHECK-NEXT: vmovhpd 8(%rdi), %xmm{{.*}}
+; CHECK: vmovupd %xmm{{.*}}, (%rdi)
+; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi)
+; CHECK: ret
 
 ; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
 define void @cftx020(double* nocapture %a) {
Index: test/CodeGen/X86/vec_loadsingles.ll
===================================================================
--- test/CodeGen/X86/vec_loadsingles.ll
+++ test/CodeGen/X86/vec_loadsingles.ll
@@ -89,7 +89,7 @@
 ; FAST32-NEXT: retq
 
 ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
 ; SLOW32-NEXT: retq
 }
 
@@ -112,7 +112,34 @@
 ; FAST32-NEXT: retq
 
 ; SLOW32: vmovups
-; SLOW32: vinsertf128
+; SLOW32-NEXT: vinsertf128
+; SLOW32-NEXT: retq
+}
+
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; Recognize and combine consecutive loads even when the
+; first of the combined loads is offset from the base address.
+define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+  %arrayidx4 = getelementptr inbounds double* %ptr, i64 4
+  %arrayidx5 = getelementptr inbounds double* %ptr, i64 5
+  %arrayidx6 = getelementptr inbounds double* %ptr, i64 6
+  %arrayidx7 = getelementptr inbounds double* %ptr, i64 7
+  %e = load double* %arrayidx4, align 8
+  %f = load double* %arrayidx5, align 8
+  %g = load double* %arrayidx6, align 8
+  %h = load double* %arrayidx7, align 8
+  %vecinit4 = insertelement <4 x double> undef, double %e, i32 0
+  %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1
+  %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
+  %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
+  ret <4 x double> %vecinit7
+
+; ALL-LABEL: merge_4_doubles_offset
+; FAST32: vmovups
+; FAST32-NEXT: retq
+
+; SLOW32: vmovups
+; SLOW32-NEXT: vinsertf128
 ; SLOW32-NEXT: retq
 }
 