diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -940,6 +940,14 @@ return VT.isSimple() && RegClassForVT[VT.getSimpleVT().SimpleTy] != nullptr; } + /// Return true if \p VT is acceptable as the type of a memory access. The + /// default defers to isTypeLegal(); targets may override it to also accept + /// types it can handle efficiently when \p BeforeTypeLegalized is true, e.g. + /// PPC allows v2i32/v2f32 so stores of two vector extracts can be merged. + virtual bool isTypeLegalForMemAccess(EVT VT, bool BeforeTypeLegalized) const { + return isTypeLegal(VT); + } + class ValueTypeActionImpl { /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum /// that indicates how instruction selection should deal with the type. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18473,7 +18473,7 @@ if (Ty.getSizeInBits() > MaximumLegalStoreInBits) break; - if (TLI.isTypeLegal(Ty) && + if (TLI.isTypeLegalForMemAccess(Ty, Level == BeforeLegalizeTypes) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) && TLI.allowsMemoryAccess(Context, DL, Ty, *FirstInChain->getMemOperand(), &IsFast) && diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1062,6 +1062,15 @@ EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override; + /// Also accept v2i32/v2f32 when \p BeforeTypeLegalized is true, so that two + /// vector extracts stored to consecutive memory locations can use one store.
+ bool isTypeLegalForMemAccess(EVT VT, + bool BeforeTypeLegalized) const override { + return TargetLoweringBase::isTypeLegalForMemAccess(VT, + BeforeTypeLegalized) || + (BeforeTypeLegalized && (VT == MVT::v2i32 || VT == MVT::v2f32)); + } + /// Is unaligned memory access allowed for the given type, and is it fast /// relative to software emulation. bool allowsMisalignedMemoryAccesses( diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -16674,8 +16674,8 @@ if (VT.getSimpleVT().isVector()) { if (Subtarget.hasVSX()) { - if (VT != MVT::v2f64 && VT != MVT::v2i64 && - VT != MVT::v4f32 && VT != MVT::v4i32) + if (VT != MVT::v2f64 && VT != MVT::v2i64 && VT != MVT::v4f32 && + VT != MVT::v4i32 && VT != MVT::v2f32 && VT != MVT::v2i32) return false; } else { return false; diff --git a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll --- a/llvm/test/CodeGen/PowerPC/extract-and-store.ll +++ b/llvm/test/CodeGen/PowerPC/extract-and-store.ll @@ -482,36 +482,34 @@ define dso_local void @test_consecutive_i32(<4 x i32> %a, i32* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_consecutive_i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stxsiwx vs34, r5, r3 -; CHECK-NEXT: stfiwx f0, 0, r5 +; CHECK-NEXT: vpkudum v2, v2, v2 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_consecutive_i32: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stfiwx f0, 0, r5 -; CHECK-BE-NEXT: stfiwx f1, r5, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI14_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI14_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 +; CHECK-BE-NEXT: vperm v2, 
v2, v2, v3 +; CHECK-BE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_consecutive_i32: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 -; CHECK-P9-NEXT: stfiwx f0, 0, r5 +; CHECK-P9-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_consecutive_i32: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI14_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI14_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs35, 0(r3) +; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr entry: @@ -526,36 +524,35 @@ define dso_local void @test_consecutive_float(<4 x float> %a, float* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_consecutive_float: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 3 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, 0, r5 -; CHECK-NEXT: stfiwx f1, r5, r3 +; CHECK-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI15_0@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: xxswapd vs35, vs0 +; CHECK-NEXT: vperm v2, v2, v2, v3 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_consecutive_float: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-BE-NEXT: vpkudum v2, v2, v2 +; CHECK-BE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_consecutive_float: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: 
xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l +; CHECK-P9-NEXT: lxv vs35, 0(r3) +; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_consecutive_float: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x float> %a, i32 1 @@ -570,59 +567,50 @@ ; CHECK-LABEL: test_stores_exceed_vec_size: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-NEXT: li r4, 20 ; CHECK-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-NEXT: lxvd2x vs0, 0, r3 -; CHECK-NEXT: li r3, 16 ; CHECK-NEXT: xxswapd vs35, vs0 ; CHECK-NEXT: vperm v3, v2, v2, v3 +; CHECK-NEXT: vsldoi v2, v2, v2, 12 ; CHECK-NEXT: xxswapd vs0, vs35 +; CHECK-NEXT: xxswapd vs1, vs34 ; CHECK-NEXT: stxvd2x vs0, 0, r5 -; CHECK-NEXT: stfiwx f1, r5, r3 -; CHECK-NEXT: stxsiwx vs34, r5, r4 +; CHECK-NEXT: stfd f1, 16(r5) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-BE-NEXT: li r4, 20 ; CHECK-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 ; CHECK-BE-NEXT: li r3, 16 -; CHECK-BE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-BE-NEXT: stfiwx f0, r5, r4 ; CHECK-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-BE-NEXT: vsldoi v2, v2, v2, 4 ; CHECK-BE-NEXT: stxvw4x vs35, 0, r5 +; CHECK-BE-NEXT: stxsdx vs34, r5, r3 ; CHECK-BE-NEXT: 
blr ; ; CHECK-P9-LABEL: test_stores_exceed_vec_size: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-NEXT: lxv vs35, 0(r3) -; CHECK-P9-NEXT: li r3, 16 -; CHECK-P9-NEXT: stfiwx f0, r5, r3 -; CHECK-P9-NEXT: li r3, 20 -; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 ; CHECK-P9-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-NEXT: vsldoi v2, v2, v2, 12 +; CHECK-P9-NEXT: xxswapd vs0, vs34 ; CHECK-P9-NEXT: stxv vs35, 0(r5) +; CHECK-P9-NEXT: stfd f0, 16(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-BE-NEXT: lxv vs35, 0(r3) -; CHECK-P9-BE-NEXT: li r3, 16 -; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 20 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-BE-NEXT: vsldoi v2, v2, v2, 4 ; CHECK-P9-BE-NEXT: stxv vs35, 0(r5) +; CHECK-P9-BE-NEXT: stxsd v2, 16(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 2 @@ -924,36 +912,41 @@ define void @test_elements_from_two_vec(<4 x i32> %a, <4 x i32> %b, i32* nocapture %c) local_unnamed_addr #0 { ; CHECK-LABEL: test_elements_from_two_vec: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, r7, r3 -; CHECK-NEXT: stfiwx f1, 0, r7 +; CHECK-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: xxswapd vs36, vs0 +; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r7 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_elements_from_two_vec: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, 
vs34, 3 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stxsiwx vs35, 0, r7 -; CHECK-BE-NEXT: stfiwx f0, r7, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs36, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-NEXT: stxsdx vs34, 0, r7 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_elements_from_two_vec: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, r7, r3 -; CHECK-P9-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-NEXT: stfiwx f0, 0, r7 +; CHECK-P9-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-P9-NEXT: lxv vs36, 0(r3) +; CHECK-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r7) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_elements_from_two_vec: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7 -; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs36, 0(r3) +; CHECK-P9-BE-NEXT: vperm v2, v3, v2, v4 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r7) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 0 @@ -967,46 +960,51 @@ define dso_local void @test_elements_from_three_vec(<4 x float> %a, <4 x float> %b, <4 x float> %c, float* nocapture %d) local_unnamed_addr #0 { ; CHECK-LABEL: test_elements_from_three_vec: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 3 +; CHECK-NEXT: addis r3, r2, .LCPI20_0@toc@ha ; CHECK-NEXT: xxsldwi vs1, vs36, vs36, 1 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: li r4, 8 -; CHECK-NEXT: stxsiwx vs35, r9, r3 -; CHECK-NEXT: stfiwx f0, 0, r9 -; CHECK-NEXT: stfiwx f1, r9, r4 +; CHECK-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; 
CHECK-NEXT: li r3, 8 +; CHECK-NEXT: stfiwx f1, r9, r3 +; CHECK-NEXT: xxswapd vs37, vs0 +; CHECK-NEXT: vperm v2, v3, v2, v5 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r9 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_elements_from_three_vec: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: li r4, 8 -; CHECK-BE-NEXT: stxsiwx vs36, r9, r4 -; CHECK-BE-NEXT: stfiwx f1, r9, r3 -; CHECK-BE-NEXT: stfiwx f0, 0, r9 +; CHECK-BE-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs37, 0, r3 +; CHECK-BE-NEXT: li r3, 8 +; CHECK-BE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-BE-NEXT: vperm v2, v2, v3, v5 +; CHECK-BE-NEXT: stxsdx vs34, 0, r9 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_elements_from_three_vec: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stxsiwx vs35, r9, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-P9-NEXT: lxv vs37, 0(r3) ; CHECK-P9-NEXT: li r3, 8 -; CHECK-P9-NEXT: stfiwx f0, 0, r9 +; CHECK-P9-NEXT: vperm v2, v3, v2, v5 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r9) ; CHECK-P9-NEXT: xxsldwi vs0, vs36, vs36, 1 ; CHECK-P9-NEXT: stfiwx f0, r9, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_elements_from_three_vec: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs37, 0(r3) ; CHECK-P9-BE-NEXT: li r3, 8 ; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-P9-BE-NEXT: vperm v2, v2, v3, v5 +; CHECK-P9-BE-NEXT: stxsd v2, 
0(r9) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x float> %a, i32 3