diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15787,16 +15787,37 @@
     break;
   case ISD::INTRINSIC_W_CHAIN:
-    // For little endian, VSX loads require generating lxvd2x/xxswapd.
-    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
-    if (Subtarget.needsSwapsForVSXMemOps()) {
-      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
-      default:
-        break;
-      case Intrinsic::ppc_vsx_lxvw4x:
-      case Intrinsic::ppc_vsx_lxvd2x:
-        return expandVSXLoadForLE(N, DCI);
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+    case Intrinsic::ppc_altivec_vsum4sbs:
+    case Intrinsic::ppc_altivec_vsum4shs:
+    case Intrinsic::ppc_altivec_vsum4ubs: {
+      // These sum-across intrinsics only have a chain due to the side effect
+      // that they may set the SAT bit. If we know the SAT bit will not be set
+      // for some inputs, we can replace any uses of their chain with the input
+      // chain.
+      if (BuildVectorSDNode *BVN =
+              dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {
+        APInt APSplatBits, APSplatUndef;
+        unsigned SplatBitSize;
+        bool HasAnyUndefs;
+        bool BVNIsConstantSplat = BVN->isConstantSplat(
+            APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,
+            !Subtarget.isLittleEndian());
+        // If the constant splat vector is 0, the SAT bit will not be set.
+        if (BVNIsConstantSplat && APSplatBits == 0)
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));
       }
+      return SDValue();
+    }
+    case Intrinsic::ppc_vsx_lxvw4x:
+    case Intrinsic::ppc_vsx_lxvd2x:
+      // For little endian, VSX loads require generating lxvd2x/xxswapd.
+      // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+      if (Subtarget.needsSwapsForVSXMemOps())
+        return expandVSXLoadForLE(N, DCI);
+      break;
     }
     break;
   case ISD::INTRINSIC_VOID:
diff --git a/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll b/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
--- a/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-sum-sat-bit-side-effect.ll
@@ -9,8 +9,6 @@
 define void @test1(<16 x i8> %0) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v3, v3, v3
-; CHECK-NEXT:    vsum4sbs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %1 = tail call <4 x i32> @llvm.ppc.altivec.vsum4sbs(<16 x i8> %0, <4 x i32> zeroinitializer)
@@ -20,8 +18,6 @@
 define void @test2(<8 x i16> %0) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v3, v3, v3
-; CHECK-NEXT:    vsum4shs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %1 = tail call <4 x i32> @llvm.ppc.altivec.vsum4shs(<8 x i16> %0, <4 x i32> zeroinitializer)
@@ -31,8 +27,6 @@
 define void @test3(<16 x i8> %0) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v3, v3, v3
-; CHECK-NEXT:    vsum4ubs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %1 = tail call <4 x i32> @llvm.ppc.altivec.vsum4ubs(<16 x i8> %0, <4 x i32> zeroinitializer)
@@ -108,9 +102,8 @@
 define <4 x i32> @test10(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v4, v4, v4
-; CHECK-NEXT:    vsum4sbs v2, v2, v4
-; CHECK-NEXT:    vsum4sbs v3, v3, v4
+; CHECK-NEXT:    xxlxor v3, v3, v3
+; CHECK-NEXT:    vsum4sbs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %2 = tail call <4 x i32> @llvm.ppc.altivec.vsum4sbs(<16 x i8> %0, <4 x i32> zeroinitializer)
@@ -121,9 +114,8 @@
 define <4 x i32> @test11(<8 x i16> %0, <8 x i16> %1) {
 ; CHECK-LABEL: test11:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v4, v4, v4
-; CHECK-NEXT:    vsum4shs v2, v2, v4
-; CHECK-NEXT:    vsum4shs v3, v3, v4
+; CHECK-NEXT:    xxlxor v3, v3, v3
+; CHECK-NEXT:    vsum4shs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %2 = tail call <4 x i32> @llvm.ppc.altivec.vsum4shs(<8 x i16> %0, <4 x i32> zeroinitializer)
@@ -134,9 +126,8 @@
 define <4 x i32> @test12(<16 x i8> %0, <16 x i8> %1) {
 ; CHECK-LABEL: test12:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxlxor v4, v4, v4
-; CHECK-NEXT:    vsum4ubs v2, v2, v4
-; CHECK-NEXT:    vsum4ubs v3, v3, v4
+; CHECK-NEXT:    xxlxor v3, v3, v3
+; CHECK-NEXT:    vsum4ubs v2, v2, v3
 ; CHECK-NEXT:    blr
 entry:
   %2 = tail call <4 x i32> @llvm.ppc.altivec.vsum4ubs(<16 x i8> %0, <4 x i32> zeroinitializer)
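A minimal IR sketch of why the combine deletes dead sums (the function name @dead_sum is hypothetical; the pattern mirrors test1 above). Side effects of these INTRINSIC_W_CHAIN nodes are modeled only through the chain result (value 1 of the node), so once ReplaceAllUsesOfValueWith rewires every chain use to the input chain and the vector result is unused, the node has no remaining users and is removed as a dead node; that is why the xxlxor/vsum4sbs pair disappears from the CHECK lines.

declare <4 x i32> @llvm.ppc.altivec.vsum4sbs(<16 x i8>, <4 x i32>)

define void @dead_sum(<16 x i8> %v) {
entry:
  ; With a zeroinitializer accumulator the SAT bit cannot be set, so the
  ; combine bypasses this call's chain; the call is then trivially dead
  ; and the function lowers to a bare blr.
  %r = tail call <4 x i32> @llvm.ppc.altivec.vsum4sbs(<16 x i8> %v, <4 x i32> zeroinitializer)
  ret void
}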