Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2428,24 +2428,45 @@
 /// Returns true if we should use a direct load into vector instruction
 /// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
-static bool usePartialVectorLoads(SDNode *N) {
-  if (!N->hasOneUse())
-    return false;
-
+static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
+
   // If there are any other uses other than scalar to vector, then we should
   // keep it as a scalar load -> direct move pattern to prevent multiple
-  // loads. Currently, only check for i64 since we have lxsd/lfd to do this
-  // efficiently, but no update equivalent.
-  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
-    EVT MemVT = LD->getMemoryVT();
-    if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) {
-      SDNode *User = *(LD->use_begin());
-      if (User->getOpcode() == ISD::SCALAR_TO_VECTOR)
-        return true;
-    }
+  // loads.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+  if (!LD)
+    return false;
+
+  EVT MemVT = LD->getMemoryVT();
+  if (!MemVT.isSimple())
+    return false;
+  switch(MemVT.getSimpleVT().SimpleTy) {
+  case MVT::i64:
+    break;
+  case MVT::i32:
+    if (!ST.hasP8Vector())
+      return false;
+    break;
+  case MVT::i16:
+  case MVT::i8:
+    if (!ST.hasP9Vector())
+      return false;
+    break;
+  default:
+    return false;
   }
-  return false;
+
+  SDValue LoadedVal(N, 0);
+  if (!LoadedVal.hasOneUse())
+    return false;
+
+  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
+       UI != UE; ++UI)
+    if (UI.getUse().get().getResNo() == 0 &&
+        UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
+      return false;
+
+  return true;
 }
 
 /// getPreIndexedAddressParts - returns true by value, base pointer and
@@ -2476,7 +2497,7 @@
   // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
   // instructions because we can fold these into a more efficient instruction
   // instead, (such as LXSD).
-  if (isLoad && usePartialVectorLoads(N)) {
+  if (isLoad && usePartialVectorLoads(N, Subtarget)) {
     return false;
   }
Index: llvm/trunk/test/CodeGen/PowerPC/pre-inc-disable.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ llvm/trunk/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
 ; RUN:   -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN:   < %s | FileCheck %s
@@ -30,26 +29,25 @@
 ; CHECK-NEXT:    xvnegsp v0, v1
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %for.cond1.preheader
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    lfd f0, 0(r3)
+; CHECK: lfd f0, 0(r3)
 ; CHECK-NEXT:    xxpermdi v1, f0, f0, 2
-; CHECK-NEXT:    vperm v6, v3, v1, v2
-; CHECK-NEXT:    vperm v1, v1, v3, v4
-; CHECK-NEXT:    xvnegsp v6, v6
+; CHECK-NEXT:    vperm v6, v1, v3, v4
+; CHECK-NEXT:    vperm v1, v3, v1, v2
 ; CHECK-NEXT:    xvnegsp v1, v1
-; CHECK-NEXT:    vabsduw v6, v6, v5
-; CHECK-NEXT:    vabsduw v1, v1, v0
-; CHECK-NEXT:    vadduwm v1, v1, v6
+; CHECK-NEXT:    xvnegsp v6, v6
+; CHECK-NEXT:    vabsduw v1, v1, v5
+; CHECK-NEXT:    vabsduw v6, v6, v0
+; CHECK-NEXT:    vadduwm v1, v6, v1
 ; CHECK-NEXT:    xxswapd v6, v1
 ; CHECK-NEXT:    vadduwm v1, v1, v6
 ; CHECK-NEXT:    xxspltw v6, v1, 2
 ; CHECK-NEXT:    vadduwm v1, v1, v6
 ; CHECK-NEXT:    vextuwrx r7, r5, v1
-; CHECK-NEXT:    ldux r8, r3, r4
-; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    lfdx f0, r3, r4
 ; CHECK-NEXT:    add r6, r7, r6
-; CHECK-NEXT:    mtvsrd f0, r8
-; CHECK-NEXT:    xxswapd v1, vs0
+; CHECK-NEXT:    add r7, r3, r4
+; CHECK-NEXT:    xxpermdi v1, f0, f0, 2
+; CHECK-NEXT:    add r3, r7, r4
 ; CHECK-NEXT:    vperm v6, v3, v1, v2
 ; CHECK-NEXT:    vperm v1, v1, v3, v4
 ; CHECK-NEXT:    xvnegsp v6, v6
@@ -61,8 +59,8 @@
 ; CHECK-NEXT:    vadduwm v1, v1, v6
 ; CHECK-NEXT:    xxspltw v6, v1, 2
 ; CHECK-NEXT:    vadduwm v1, v1, v6
-; CHECK-NEXT:    vextuwrx r7, r5, v1
-; CHECK-NEXT:    add r6, r7, r6
+; CHECK-NEXT:    vextuwrx r8, r5, v1
+; CHECK-NEXT:    add r6, r8, r6
 ; CHECK-NEXT:    bdnz .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
 ; CHECK-NEXT:    extsw r3, r6
@@ -89,8 +87,7 @@
 ; P9BE-NEXT:    xvnegsp v0, v1
 ; P9BE-NEXT:    .p2align 4
 ; P9BE-NEXT:  .LBB0_1: # %for.cond1.preheader
-; P9BE-NEXT:    # =>This Inner Loop Header: Depth=1
-; P9BE-NEXT:    lfd f0, 0(r3)
+; P9BE: lfd f0, 0(r3)
 ; P9BE-NEXT:    xxlor v1, vs0, vs0
 ; P9BE-NEXT:    vperm v6, v3, v1, v4
 ; P9BE-NEXT:    vperm v1, v3, v1, v2
@@ -104,10 +101,11 @@
 ; P9BE-NEXT:    xxspltw v6, v1, 1
 ; P9BE-NEXT:    vadduwm v1, v1, v6
 ; P9BE-NEXT:    vextuwlx r7, r5, v1
+; P9BE-NEXT:    lfdx f0, r3, r4
 ; P9BE-NEXT:    add r6, r7, r6
-; P9BE-NEXT:    ldux r7, r3, r4
-; P9BE-NEXT:    add r3, r3, r4
-; P9BE-NEXT:    mtvsrd v1, r7
+; P9BE-NEXT:    add r7, r3, r4
+; P9BE-NEXT:    xxlor v1, vs0, vs0
+; P9BE-NEXT:    add r3, r7, r4
 ; P9BE-NEXT:    vperm v6, v3, v1, v2
 ; P9BE-NEXT:    vperm v1, v3, v1, v4
 ; P9BE-NEXT:    xvnegsp v6, v6
@@ -119,8 +117,8 @@
 ; P9BE-NEXT:    vadduwm v1, v1, v6
 ; P9BE-NEXT:    xxspltw v6, v1, 1
 ; P9BE-NEXT:    vadduwm v1, v1, v6
-; P9BE-NEXT:    vextuwlx r7, r5, v1
-; P9BE-NEXT:    add r6, r7, r6
+; P9BE-NEXT:    vextuwlx r8, r5, v1
+; P9BE-NEXT:    add r6, r8, r6
 ; P9BE-NEXT:    bdnz .LBB0_1
 ; P9BE-NEXT:  # %bb.2: # %for.cond.cleanup
 ; P9BE-NEXT:    extsw r3, r6
@@ -281,3 +279,119 @@
 ; return i_sum;
 ;}
 
+define void @test32(i8* nocapture readonly %pix2, i32 signext %i_pix2) {
+entry:
+  %idx.ext63 = sext i32 %i_pix2 to i64
+  %add.ptr64 = getelementptr inbounds i8, i8* %pix2, i64 %idx.ext63
+  %arrayidx5.1 = getelementptr inbounds i8, i8* %add.ptr64, i64 4
+  %0 = bitcast i8* %add.ptr64 to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  %reorder_shuffle117 = shufflevector <4 x i8> %1, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %2 = zext <4 x i8> %reorder_shuffle117 to <4 x i32>
+  %3 = sub nsw <4 x i32> zeroinitializer, %2
+  %4 = bitcast i8* %arrayidx5.1 to <4 x i8>*
+  %5 = load <4 x i8>, <4 x i8>* %4, align 1
+  %reorder_shuffle115 = shufflevector <4 x i8> %5, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %6 = zext <4 x i8> %reorder_shuffle115 to <4 x i32>
+  %7 = sub nsw <4 x i32> zeroinitializer, %6
+  %8 = shl nsw <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
+  %9 = add nsw <4 x i32> %8, %3
+  %10 = sub nsw <4 x i32> %9, zeroinitializer
+  %11 = shufflevector <4 x i32> undef, <4 x i32> %10, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %12 = add nsw <4 x i32> zeroinitializer, %11
+  %13 = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i32> %13, <4 x i32>* undef, align 16
+  ret void
+; CHECK-LABEL: test32:
+; CHECK-NOT: lwzux
+; CHECK-NOT: mtvsrws
+; CHECK: lfiwzx
+; CHECK: lfiwzx
+; P9BE-LABEL: test32:
+; P9BE-NOT: lwzux
+; P9BE-NOT: mtvsrws
+; P9BE: lfiwzx
+; P9BE: lfiwzx
+}
+
+define void @test16(i16* nocapture readonly %sums, i32 signext %delta, i32 signext %thresh) {
+entry:
+  %idxprom = sext i32 %delta to i64
+  %add14 = add nsw i32 %delta, 8
+  %idxprom15 = sext i32 %add14 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %arrayidx8 = getelementptr inbounds i16, i16* %sums, i64 %idxprom
+  %0 = load i16, i16* %arrayidx8, align 2
+  %arrayidx16 = getelementptr inbounds i16, i16* %sums, i64 %idxprom15
+  %1 = load i16, i16* %arrayidx16, align 2
+  %2 = insertelement <4 x i16> undef, i16 %0, i32 2
+  %3 = insertelement <4 x i16> %2, i16 %1, i32 3
+  %4 = zext <4 x i16> %3 to <4 x i32>
+  %5 = sub nsw <4 x i32> zeroinitializer, %4
+  %6 = sub nsw <4 x i32> zeroinitializer, %5
+  %7 = select <4 x i1> undef, <4 x i32> %6, <4 x i32> %5
+  %bin.rdx = add <4 x i32> %7, zeroinitializer
+  %rdx.shuf54 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx55 = add <4 x i32> %bin.rdx, %rdx.shuf54
+  %8 = extractelement <4 x i32> %bin.rdx55, i32 0
+  %op.extra = add nuw i32 %8, 0
+  %cmp25 = icmp slt i32 %op.extra, %thresh
+  br i1 %cmp25, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  unreachable
+
+if.end:                                           ; preds = %for.body
+  ret void
+; CHECK-LABEL: test16:
+; CHECK-NOT: lhzux
+; CHECK: lxsihzx
+; CHECK: lxsihzx
+; P9BE-LABEL: test16:
+; P9BE-NOT: lhzux
+; P9BE: lxsihzx
+; P9BE: lxsihzx
+}
+
+define void @test8(i8* nocapture readonly %sums, i32 signext %delta, i32 signext %thresh) {
+entry:
+  %idxprom = sext i32 %delta to i64
+  %add14 = add nsw i32 %delta, 8
+  %idxprom15 = sext i32 %add14 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %arrayidx8 = getelementptr inbounds i8, i8* %sums, i64 %idxprom
+  %0 = load i8, i8* %arrayidx8, align 2
+  %arrayidx16 = getelementptr inbounds i8, i8* %sums, i64 %idxprom15
+  %1 = load i8, i8* %arrayidx16, align 2
+  %2 = insertelement <4 x i8> undef, i8 %0, i32 2
+  %3 = insertelement <4 x i8> %2, i8 %1, i32 3
+  %4 = zext <4 x i8> %3 to <4 x i32>
+  %5 = sub nsw <4 x i32> zeroinitializer, %4
+  %6 = sub nsw <4 x i32> zeroinitializer, %5
+  %7 = select <4 x i1> undef, <4 x i32> %6, <4 x i32> %5
+  %bin.rdx = add <4 x i32> %7, zeroinitializer
+  %rdx.shuf54 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx55 = add <4 x i32> %bin.rdx, %rdx.shuf54
+  %8 = extractelement <4 x i32> %bin.rdx55, i32 0
+  %op.extra = add nuw i32 %8, 0
+  %cmp25 = icmp slt i32 %op.extra, %thresh
+  br i1 %cmp25, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  unreachable
+
+if.end:                                           ; preds = %for.body
+  ret void
+; CHECK-LABEL: test8:
+; CHECK-NOT: lbzux
+; CHECK: lxsibzx
+; CHECK: lxsibzx
+; P9BE-LABEL: test8:
+; P9BE-NOT: lbzux
+; P9BE: lxsibzx
+; P9BE: lxsibzx
+}
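
For illustration only (not part of the patch): as I read the rewritten
usePartialVectorLoads(), a load now qualifies when the single user of its
loaded value (result 0) is a SCALAR_TO_VECTOR node, gated on the subtarget:
i64 always qualifies (lxsd/lfd), i32 additionally needs Power8 vector support
(lfiwzx/lxsiwzx), and i16/i8 need Power9 (lxsihzx/lxsibzx). Below is a minimal
C sketch of source that should exercise the new i16 path, mirroring the
test16 pattern above; the function and variable names are invented, and it
assumes a powerpc64le target compiled with -mcpu=pwr9 -maltivec:

  #include <altivec.h>

  /* A scalar halfword load whose only use is an insert into a vector lane.
   * With this change the load should stay in non-pre-increment form so it
   * can be selected as a direct load into a vector register (lxsihzx),
   * rather than a GPR load with update (lhzux) followed by a direct move. */
  vector unsigned short load_lane(const unsigned short *sums, long delta) {
    vector unsigned short v = (vector unsigned short){0, 0, 0, 0, 0, 0, 0, 0};
    return vec_insert(sums[delta], v, 3);
  }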