diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18444,6 +18444,17 @@ *FirstInChain->getMemOperand(), &IsFast) && IsFast) NumStoresToMerge = i + 1; + else if (Level == BeforeLegalizeTypes) { + EVT DoubleTy = Ty.getDoubleNumVectorElementsVT(*DAG.getContext()); + if (TLI.isTypeLegal(DoubleTy) && + TLI.getTypeToTransformTo(*DAG.getContext(), Ty) == DoubleTy && + TLI.canMergeStoresTo(FirstStoreAS, DoubleTy, + DAG.getMachineFunction()) && + TLI.allowsMemoryAccess(Context, DL, DoubleTy, + *FirstInChain->getMemOperand(), &IsFast) && + IsFast) + NumStoresToMerge = i + 1; + } } // Check if we found a legal integer type creating a meaningful diff --git a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll --- a/llvm/test/CodeGen/PowerPC/extract-and-store.ll +++ b/llvm/test/CodeGen/PowerPC/extract-and-store.ll @@ -668,54 +668,48 @@ define dso_local void @test_consecutive_i32(<4 x i32> %a, i32* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_consecutive_i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stxsiwx vs34, r5, r3 -; CHECK-NEXT: stfiwx f0, 0, r5 +; CHECK-NEXT: vpkudum v2, v2, v2 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_consecutive_i32: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stfiwx f0, 0, r5 -; CHECK-BE-NEXT: stfiwx f1, r5, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI14_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI14_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 +; CHECK-BE-NEXT: vperm v2, v2, v2, v3 +; CHECK-BE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-BE-PERFSHUFFLE-LABEL: test_consecutive_i32: ; CHECK-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f0, 0, r5 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f1, r5, r3 +; CHECK-BE-PERFSHUFFLE-NEXT: xxswapd vs0, vs34 +; CHECK-BE-PERFSHUFFLE-NEXT: xxmrghw vs0, vs34, vs0 +; CHECK-BE-PERFSHUFFLE-NEXT: stfdx f0, 0, r5 ; CHECK-BE-PERFSHUFFLE-NEXT: blr ; ; CHECK-P9-LABEL: test_consecutive_i32: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 -; CHECK-P9-NEXT: stfiwx f0, 0, r5 +; CHECK-P9-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_consecutive_i32: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI14_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI14_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs35, 0(r3) +; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr ; ; CHECK-P9-BE-PERFSHUFFLE-LABEL: test_consecutive_i32: ; CHECK-P9-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxswapd vs0, vs34 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxmrghw vs0, vs34, vs0 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfd f0, 0(r5) ; CHECK-P9-BE-PERFSHUFFLE-NEXT: blr entry: @@ -730,52 +724,47 @@ define dso_local void @test_consecutive_float(<4 x float> %a, float* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_consecutive_float: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 3 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, 0, r5 -; CHECK-NEXT: stfiwx f1, r5, r3 +; CHECK-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI15_0@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: xxswapd vs35, vs0 +; CHECK-NEXT: vperm v2, v2, v2, v3 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_consecutive_float: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-BE-NEXT: vpkudum v2, v2, v2 +; CHECK-BE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-BE-PERFSHUFFLE-LABEL: test_consecutive_float: ; CHECK-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-BE-PERFSHUFFLE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f0, r5, r3 +; CHECK-BE-PERFSHUFFLE-NEXT: vpkudum v2, v2, v2 +; CHECK-BE-PERFSHUFFLE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-PERFSHUFFLE-NEXT: blr ; ; CHECK-P9-LABEL: test_consecutive_float: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l +; CHECK-P9-NEXT: lxv vs35, 0(r3) +; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_consecutive_float: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr ; ; CHECK-P9-BE-PERFSHUFFLE-LABEL: test_consecutive_float: ; CHECK-P9-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-PERFSHUFFLE-NEXT: blr entry: %vecext = extractelement <4 x float> %a, i32 1 @@ -790,83 +779,69 @@ ; CHECK-LABEL: test_stores_exceed_vec_size: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-NEXT: li r4, 20 ; CHECK-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-NEXT: lxvd2x vs0, 0, r3 -; CHECK-NEXT: li r3, 16 ; CHECK-NEXT: xxswapd vs35, vs0 ; CHECK-NEXT: vperm v3, v2, v2, v3 +; CHECK-NEXT: vsldoi v2, v2, v2, 12 ; CHECK-NEXT: xxswapd vs0, vs35 +; CHECK-NEXT: xxswapd vs1, vs34 ; CHECK-NEXT: stxvd2x vs0, 0, r5 -; CHECK-NEXT: stfiwx f1, r5, r3 -; CHECK-NEXT: stxsiwx vs34, r5, r4 +; CHECK-NEXT: stfd f1, 16(r5) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-BE-NEXT: li r4, 20 ; CHECK-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 ; CHECK-BE-NEXT: li r3, 16 -; CHECK-BE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-BE-NEXT: stfiwx f0, r5, r4 ; CHECK-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-BE-NEXT: vsldoi v2, v2, v2, 4 ; CHECK-BE-NEXT: stxvw4x vs35, 0, r5 +; CHECK-BE-NEXT: stxsdx vs34, r5, r3 ; CHECK-BE-NEXT: blr ; ; CHECK-BE-PERFSHUFFLE-LABEL: test_stores_exceed_vec_size: ; CHECK-BE-PERFSHUFFLE: # %bb.0: # %entry ; CHECK-BE-PERFSHUFFLE-NEXT: xxspltw vs0, vs34, 0 -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs1, vs34, vs34, 1 +; CHECK-BE-PERFSHUFFLE-NEXT: vsldoi v3, v2, v2, 4 ; CHECK-BE-PERFSHUFFLE-NEXT: li r3, 16 -; CHECK-BE-PERFSHUFFLE-NEXT: li r4, 20 -; CHECK-BE-PERFSHUFFLE-NEXT: stxsiwx vs34, r5, r3 ; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs0, 2 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f1, r5, r4 ; CHECK-BE-PERFSHUFFLE-NEXT: stxvw4x vs0, 0, r5 +; CHECK-BE-PERFSHUFFLE-NEXT: stxsdx vs35, r5, r3 ; CHECK-BE-PERFSHUFFLE-NEXT: blr ; ; CHECK-P9-LABEL: test_stores_exceed_vec_size: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-NEXT: lxv vs35, 0(r3) -; CHECK-P9-NEXT: li r3, 16 -; CHECK-P9-NEXT: stfiwx f0, r5, r3 -; CHECK-P9-NEXT: li r3, 20 -; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 ; CHECK-P9-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-NEXT: vsldoi v2, v2, v2, 12 +; CHECK-P9-NEXT: xxswapd vs0, vs34 ; CHECK-P9-NEXT: stxv vs35, 0(r5) +; CHECK-P9-NEXT: stfd f0, 16(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-BE-NEXT: lxv vs35, 0(r3) -; CHECK-P9-BE-NEXT: li r3, 16 -; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 20 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-P9-BE-NEXT: vsldoi v2, v2, v2, 4 ; CHECK-P9-BE-NEXT: stxv vs35, 0(r5) +; CHECK-P9-BE-NEXT: stxsd v2, 16(r5) ; CHECK-P9-BE-NEXT: blr ; ; CHECK-P9-BE-PERFSHUFFLE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE-PERFSHUFFLE: # %bb.0: # %entry ; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxspltw vs0, vs34, 0 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 16 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 20 ; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs0, 2 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: vsldoi v2, v2, v2, 4 ; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxv vs0, 0(r5) -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxsd v2, 16(r5) ; CHECK-P9-BE-PERFSHUFFLE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 2 @@ -1276,52 +1251,55 @@ define void @test_elements_from_two_vec(<4 x i32> %a, <4 x i32> %b, i32* nocapture %c) local_unnamed_addr #0 { ; CHECK-LABEL: test_elements_from_two_vec: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, r7, r3 -; CHECK-NEXT: stfiwx f1, 0, r7 +; CHECK-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: xxswapd vs36, vs0 +; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r7 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_elements_from_two_vec: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stxsiwx vs35, 0, r7 -; CHECK-BE-NEXT: stfiwx f0, r7, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs36, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-NEXT: stxsdx vs34, 0, r7 ; CHECK-BE-NEXT: blr ; ; CHECK-BE-PERFSHUFFLE-LABEL: test_elements_from_two_vec: ; CHECK-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-BE-PERFSHUFFLE-NEXT: stxsiwx vs35, 0, r7 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f0, r7, r3 +; CHECK-BE-PERFSHUFFLE-NEXT: xxmrghw vs35, vs35, vs35 +; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs35, vs34, 3 +; CHECK-BE-PERFSHUFFLE-NEXT: stfdx f0, 0, r7 ; CHECK-BE-PERFSHUFFLE-NEXT: blr ; ; CHECK-P9-LABEL: test_elements_from_two_vec: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, r7, r3 -; CHECK-P9-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-NEXT: stfiwx f0, 0, r7 +; CHECK-P9-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-P9-NEXT: lxv vs36, 0(r3) +; CHECK-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r7) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_elements_from_two_vec: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7 -; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs36, 0(r3) +; CHECK-P9-BE-NEXT: vperm v2, v3, v2, v4 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r7) ; CHECK-P9-BE-NEXT: blr ; ; CHECK-P9-BE-PERFSHUFFLE-LABEL: test_elements_from_two_vec: ; CHECK-P9-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxsiwx vs35, 0, r7 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, r7, r3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxmrghw vs35, vs35, vs35 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs35, vs34, 3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfd f0, 0(r7) ; CHECK-P9-BE-PERFSHUFFLE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 0 @@ -1335,68 +1313,69 @@ define dso_local void @test_elements_from_three_vec(<4 x float> %a, <4 x float> %b, <4 x float> %c, float* nocapture %d) local_unnamed_addr #0 { ; CHECK-LABEL: test_elements_from_three_vec: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 3 +; CHECK-NEXT: addis r3, r2, .LCPI20_0@toc@ha ; CHECK-NEXT: xxsldwi vs1, vs36, vs36, 1 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: li r4, 8 -; CHECK-NEXT: stxsiwx vs35, r9, r3 -; CHECK-NEXT: stfiwx f0, 0, r9 -; CHECK-NEXT: stfiwx f1, r9, r4 +; CHECK-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-NEXT: lxvd2x vs0, 0, r3 +; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: stfiwx f1, r9, r3 +; CHECK-NEXT: xxswapd vs37, vs0 +; CHECK-NEXT: vperm v2, v3, v2, v5 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r9 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_elements_from_three_vec: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: li r4, 8 -; CHECK-BE-NEXT: stxsiwx vs36, r9, r4 -; CHECK-BE-NEXT: stfiwx f1, r9, r3 -; CHECK-BE-NEXT: stfiwx f0, 0, r9 +; CHECK-BE-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs37, 0, r3 +; CHECK-BE-NEXT: li r3, 8 +; CHECK-BE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-BE-NEXT: vperm v2, v2, v3, v5 +; CHECK-BE-NEXT: stxsdx vs34, 0, r9 ; CHECK-BE-NEXT: blr ; ; CHECK-BE-PERFSHUFFLE-LABEL: test_elements_from_three_vec: ; CHECK-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-BE-PERFSHUFFLE-NEXT: li r4, 8 -; CHECK-BE-PERFSHUFFLE-NEXT: stxsiwx vs36, r9, r4 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f1, r9, r3 -; CHECK-BE-PERFSHUFFLE-NEXT: stfiwx f0, 0, r9 +; CHECK-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 1 +; CHECK-BE-PERFSHUFFLE-NEXT: li r3, 8 +; CHECK-BE-PERFSHUFFLE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-BE-PERFSHUFFLE-NEXT: xxmrglw vs0, vs0, vs35 +; CHECK-BE-PERFSHUFFLE-NEXT: stfdx f0, 0, r9 ; CHECK-BE-PERFSHUFFLE-NEXT: blr ; ; CHECK-P9-LABEL: test_elements_from_three_vec: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stxsiwx vs35, r9, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-P9-NEXT: lxv vs37, 0(r3) ; CHECK-P9-NEXT: li r3, 8 -; CHECK-P9-NEXT: stfiwx f0, 0, r9 +; CHECK-P9-NEXT: vperm v2, v3, v2, v5 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r9) ; CHECK-P9-NEXT: xxsldwi vs0, vs36, vs36, 1 ; CHECK-P9-NEXT: stfiwx f0, r9, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_elements_from_three_vec: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-P9-BE-NEXT: lxv vs37, 0(r3) ; CHECK-P9-BE-NEXT: li r3, 8 ; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-P9-BE-NEXT: vperm v2, v2, v3, v5 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r9) ; CHECK-P9-BE-NEXT: blr ; ; CHECK-P9-BE-PERFSHUFFLE-LABEL: test_elements_from_three_vec: ; CHECK-P9-BE-PERFSHUFFLE: # %bb.0: # %entry -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 4 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, 0, r9 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfiwx f0, r9, r3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-BE-PERFSHUFFLE-NEXT: li r3, 8 ; CHECK-P9-BE-PERFSHUFFLE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: xxmrglw vs0, vs0, vs35 +; CHECK-P9-BE-PERFSHUFFLE-NEXT: stfd f0, 0(r9) ; CHECK-P9-BE-PERFSHUFFLE-NEXT: blr entry: %vecext = extractelement <4 x float> %a, i32 3 diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1204,35 +1204,23 @@ ; SSE42-NEXT: movups %xmm1, 16(%rdi) ; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; SSE42-NEXT: movups %xmm2, 32(%rdi) -; SSE42-NEXT: extractps $1, %xmm3, 48(%rdi) -; SSE42-NEXT: extractps $2, %xmm3, 52(%rdi) +; SSE42-NEXT: movaps %xmm3, %xmm0 +; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] +; SSE42-NEXT: movlps %xmm0, 48(%rdi) ; SSE42-NEXT: extractps $3, %xmm3, 56(%rdi) ; SSE42-NEXT: retq ; -; AVX1-LABEL: compressstore_v16f32_const: -; AVX1: ## %bb.0: -; AVX1-NEXT: vmovups %ymm0, (%rdi) -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vmovups %xmm1, 32(%rdi) -; AVX1-NEXT: vextractps $1, %xmm0, 48(%rdi) -; AVX1-NEXT: vextractps $2, %xmm0, 52(%rdi) -; AVX1-NEXT: vextractps $3, %xmm0, 56(%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: compressstore_v16f32_const: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmovups %ymm0, (%rdi) -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,4] -; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovups %xmm0, 32(%rdi) -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vextractps $1, %xmm0, 48(%rdi) -; AVX2-NEXT: vextractps $2, %xmm0, 52(%rdi) -; AVX2-NEXT: vextractps $3, %xmm0, 56(%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: compressstore_v16f32_const: +; AVX1OR2: ## %bb.0: +; AVX1OR2-NEXT: vmovups %ymm0, (%rdi) +; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; AVX1OR2-NEXT: vmovups %xmm1, 32(%rdi) +; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,2,2,3] +; AVX1OR2-NEXT: vmovlps %xmm1, 48(%rdi) +; AVX1OR2-NEXT: vextractps $3, %xmm0, 56(%rdi) +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: compressstore_v16f32_const: ; AVX512F: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/pr40994.ll b/llvm/test/CodeGen/X86/pr40994.ll --- a/llvm/test/CodeGen/X86/pr40994.ll +++ b/llvm/test/CodeGen/X86/pr40994.ll @@ -4,15 +4,8 @@ define <8 x i8> @foo(<16 x i8> %a) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: pextrb $0, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $2, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $4, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $6, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $8, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $10, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $12, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pextrb $14, %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; CHECK-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: retq %v = alloca i8, i32 8, align 16 call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, ptr %v, <16 x i1> )