Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -1096,6 +1096,11 @@
     // tail call. This will cause the optimizers to attempt to move, or
     // duplicate return instructions to help enable tail call optimizations.
     bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+
+    // If the input vector would require a direct move to extract the element,
+    // but the store can be combined into PPC::STIWX, we want to combine it.
+    bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                   unsigned &Cost) const override;
   }; // end class PPCTargetLowering
 
   namespace PPC {
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -13940,3 +13940,35 @@
   // If the function is local then we have a good chance at tail-calling it
   return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
 }
+
+bool PPCTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  if (!Subtarget.hasDirectMove() || !Subtarget.hasAltivec() ||
+      !Subtarget.hasVSX())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
+  if (!CI)
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = VectorTy->getScalarSizeInBits();
+
+  // We only have combined stores for sub-word types on Power9.
+  if (BitWidth > 32 || (!Subtarget.hasP9Vector() && BitWidth != 32))
+    return false;
+
+  uint64_t CIdx = CI->getZExtValue();
+  uint64_t NaturalIdx = -1UL;
+  switch (BitWidth) {
+  default: return false;
+  case 8:  NaturalIdx = Subtarget.isLittleEndian() ? 8 : 7; break;
+  case 16: NaturalIdx = Subtarget.isLittleEndian() ? 4 : 3; break;
+  case 32: NaturalIdx = Subtarget.isLittleEndian() ? 2 : 1; break;
+  }
+
+  Cost = CIdx == NaturalIdx ? 0 : 3;
+  return true;
+}
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -1431,7 +1431,7 @@
                               "xscvspdpn $XT, $XB", IIC_VecFP, []>;
   } // UseVSXReg = 1
 
-  let Predicates = [IsLittleEndian] in {
+  let Predicates = [HasP8Vector, IsLittleEndian] in {
   def : Pat<(f32 (PPCfcfids
                   (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
             (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
@@ -1446,9 +1446,13 @@
                   (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
             (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)),
                                               VSFRC)))>;
+  def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src),
+            (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+  def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src),
+            (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
   }
 
-  let Predicates = [IsBigEndian] in {
+  let Predicates = [HasP8Vector, IsBigEndian] in {
   def : Pat<(f32 (PPCfcfids
                   (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 0)))))),
             (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
@@ -1461,6 +1465,10 @@
   def : Pat<(f32 (PPCfcfidus
                   (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
             (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+  def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src),
+            (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+  def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src),
+            (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
   }
   def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
             (v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>;
Index: test/CodeGen/PowerPC/combine-extract-store.ll
===================================================================
--- test/CodeGen/PowerPC/combine-extract-store.ll
+++ test/CodeGen/PowerPC/combine-extract-store.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -verify-machineinstrs -O2 < %s | FileCheck %s
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown \
+; RUN:   -verify-machineinstrs -O2 < %s | FileCheck %s --check-prefix=CHECK-BE
+
+; Function Attrs: norecurse nounwind
+define void @test(<4 x i32>* noalias nocapture readonly %VP, <4 x i32>* noalias nocapture %VP2, i32* noalias nocapture %IP) local_unnamed_addr #0 {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vspltisw 2, 4
+; CHECK-NEXT:    lvx 3, 0, 3
+; CHECK-NEXT:    vadduwm 2, 3, 2
+; CHECK-NEXT:    stxsiwx 34, 0, 5
+; CHECK-NEXT:    stvx 3, 0, 4
+; CHECK-NEXT:    blr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %VP, align 16
+  %vecext = extractelement <4 x i32> %0, i32 2
+  %add = add nsw i32 %vecext, 4
+  store i32 %add, i32* %IP, align 4
+  store <4 x i32> %0, <4 x i32>* %VP2, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @testf(<4 x float>* noalias nocapture readonly %VP, <4 x float>* noalias nocapture %VP2, float* noalias nocapture %IP) local_unnamed_addr #0 {
+; CHECK-LABEL: testf:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 6, 2, .LCPI1_0@toc@ha
+; CHECK-NEXT:    lvx 2, 0, 3
+; CHECK-NEXT:    addi 6, 6, .LCPI1_0@toc@l
+; CHECK-NEXT:    lvx 3, 0, 6
+; CHECK-NEXT:    stvx 2, 0, 4
+; CHECK-NEXT:    xvaddsp 0, 34, 35
+; CHECK-NEXT:    stfiwx 0, 0, 5
+; CHECK-NEXT:    blr
+entry:
+  %0 = load <4 x float>, <4 x float>* %VP, align 16
+  %vecext = extractelement <4 x float> %0, i32 2
+  %add = fadd float %vecext, 4.000000e+00
+  store float %add, float* %IP, align 4
+  store <4 x float> %0, <4 x float>* %VP2, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @testBE(<4 x i32>* noalias nocapture readonly %VP, <4 x i32>* noalias nocapture %VP2, i32* noalias nocapture %IP) local_unnamed_addr #0 {
+; CHECK-BE-LABEL: testBE:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vspltisw 2, 4
+; CHECK-BE-NEXT:    lxvw4x 35, 0, 3
+; CHECK-BE-NEXT:    vadduwm 2, 3, 2
+; CHECK-BE-NEXT:    stxsiwx 34, 0, 5
+; CHECK-BE-NEXT:    stxvw4x 35, 0, 4
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %VP, align 16
+  %vecext = extractelement <4 x i32> %0, i32 1
+  %add = add nsw i32 %vecext, 4
+  store i32 %add, i32* %IP, align 4
+  store <4 x i32> %0, <4 x i32>* %VP2, align 16
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @testBEf(<4 x float>* noalias nocapture readonly %VP, <4 x float>* noalias nocapture %VP2, float* noalias nocapture %IP) local_unnamed_addr #0 {
+; CHECK-BE-LABEL: testBEf:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 6, 2, .LCPI3_0@toc@ha
+; CHECK-BE-NEXT:    lxvw4x 0, 0, 3
+; CHECK-BE-NEXT:    addi 6, 6, .LCPI3_0@toc@l
+; CHECK-BE-NEXT:    lxvw4x 1, 0, 6
+; CHECK-BE-NEXT:    stxvw4x 0, 0, 4
+; CHECK-BE-NEXT:    xvaddsp 1, 0, 1
+; CHECK-BE-NEXT:    stfiwx 1, 0, 5
+; CHECK-BE-NEXT:    blr
+entry:
+  %0 = load <4 x float>, <4 x float>* %VP, align 16
+  %vecext = extractelement <4 x float> %0, i32 1
+  %add = fadd float %vecext, 4.000000e+00
+  store float %add, float* %IP, align 4
+  store <4 x float> %0, <4 x float>* %VP2, align 16
+  ret void
+}
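
A minimal stand-alone reduction, offered only as a reviewer convenience and not part of the patch (the function name @store_lane2 and the file name reduced.ll are illustrative assumptions). It mirrors the bundled @test case, so with the patch applied a pwr8 little-endian build should match the new little-endian STIWX pattern for the lane-2 extract-and-store and emit stxsiwx instead of going through a direct move and a scalar store:

; reduced.ll - on little-endian, lane 2 of a v4i32 already sits in the first
; doubleword of the VSR, so the store can be combined with the extract.
define void @store_lane2(<4 x i32> %a, <4 x i32> %b, i32* %p) {
entry:
  %sum = add <4 x i32> %a, %b                 ; keeps the value in a vector register
  %e = extractelement <4 x i32> %sum, i32 2
  store i32 %e, i32* %p, align 4              ; expected to become stxsiwx
  ret void
}

To check: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < reduced.ll | grep stxsiwx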