Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -448,6 +448,23 @@
     return true;
   }
 
+  /// Return how many consecutive stores of vector elements may be merged
+  /// into a single vector store before type legalization, where the merged
+  /// vector type is not required to be legal.
+  ///
+  /// \p MemVT is the type of the stored elements, \p AS and \p Align are the
+  /// address space and alignment of the store access, and
+  /// \p NumConsecutiveStores is the length of the run of candidate stores.
+  /// The default never returns more than 1, i.e. it requests no merge;
+  /// targets override this to opt in.
+  virtual unsigned getNumStoresOfVectorElementsToMergePreLegalize(
+      LLVMContext &Context, const DataLayout &DL, EVT MemVT, unsigned AS,
+      unsigned Align, unsigned NumConsecutiveStores) const {
+    // Cap at one store (zero if the run is empty): one store is not a merge.
+    const unsigned NoMerge = 1U;
+    return std::min(NoMerge, NumConsecutiveStores);
+  }
+
   /// Return true if it is cheap to speculate a call to intrinsic cttz.
   virtual bool isCheapToSpeculateCttz() const {
     return false;
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15349,6 +15349,19 @@
             NumStoresToMerge = i + 1;
         }
 
+        // If the loop above found nothing to merge (NumStoresToMerge is still
+        // 1) and types have not been legalized yet, let the target decide how
+        // many elements it can merge: some targets can shuffle vector
+        // elements during type legalization, so a legal type for the merged
+        // vector store is not essential at this level.
+        // NOTE(review): the hook receives the scalar element type; confirm
+        // the returned count is still right if MemVT can be a vector here.
+        const bool NoMergeFound = NumStoresToMerge == 1;
+        if (NoMergeFound && Level == BeforeLegalizeTypes)
+          NumStoresToMerge = TLI.getNumStoresOfVectorElementsToMergePreLegalize(
+              Context, DL, MemVT.getScalarType(), FirstStoreAS, FirstStoreAlign,
+              NumConsecutiveStores);
+
         // Check if we found a legal integer type creating a meaningful
         // merge.
if (NumStoresToMerge < 2) {
Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -852,6 +852,13 @@
                                         unsigned Align = 1,
                                         bool *Fast = nullptr) const override;
 
+    /// Allow some consecutive stores of vector elements to be merged before
+    /// type legalization even though the merged vector does not fit a legal
+    /// vector type. Returns the number of stores to merge.
+    unsigned getNumStoresOfVectorElementsToMergePreLegalize(
+        LLVMContext &Context, const DataLayout &DL, EVT MemVT, unsigned AS,
+        unsigned Align, unsigned NumConsecutiveStores) const override;
+
     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
     /// expanded to FMAs when this method returns true, otherwise fmuladd is
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14488,6 +14488,40 @@
   return true;
 }
 
+/// PPC implementation of the pre-legalization store-merge hook. Returns how
+/// many of the \p NumConsecutiveStores stores of \p MemVT elements to merge:
+/// as many as fit in one 128-bit vector when VSX is available, otherwise
+/// whatever the TargetLowering default returns.
+unsigned PPCTargetLowering::getNumStoresOfVectorElementsToMergePreLegalize(
+    LLVMContext &Context, const DataLayout &DL, EVT MemVT, unsigned AS,
+    unsigned Align, unsigned NumConsecutiveStores) const {
+  // Defer to the generic default whenever PPC has nothing better to offer.
+  auto UseDefault = [&]() {
+    return TargetLowering::getNumStoresOfVectorElementsToMergePreLegalize(
+        Context, DL, MemVT, AS, Align, NumConsecutiveStores);
+  };
+
+  // When DisablePPCUnaligned is set, do not merge stores whose alignment is
+  // below the ABI alignment of the element type.
+  if (DisablePPCUnaligned &&
+      Align < DL.getABITypeAlignment(MemVT.getTypeForEVT(Context)))
+    return UseDefault();
+
+  if (NumConsecutiveStores < 2 || !Subtarget.hasVSX() || !MemVT.isSimple())
+    return UseDefault();
+
+  // PPC's vector has a size of 128 bits. Elements that are wider than that
+  // (or have no size) cannot be packed into one; bailing out here also keeps
+  // the division below well defined and the result from ever being 0.
+  const unsigned VectorSizeInBits = 128U;
+  const unsigned ElementSizeInBits = MemVT.getSizeInBits();
+  if (ElementSizeInBits == 0 || ElementSizeInBits > VectorSizeInBits)
+    return UseDefault();
+
+  const unsigned MaxNumberOfLegalStores = VectorSizeInBits / ElementSizeInBits;
+  return std::min(MaxNumberOfLegalStores, NumConsecutiveStores);
+}
+
 bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   VT = VT.getScalarType();
 
Index: llvm/test/CodeGen/PowerPC/extract-and-store.ll =================================================================== --- llvm/test/CodeGen/PowerPC/extract-and-store.ll +++ llvm/test/CodeGen/PowerPC/extract-and-store.ll @@ -482,36 +482,30 @@ define dso_local void @test_consecutive_i32(<4 x i32> %a, i32* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_consecutive_i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, 0, r5 -; CHECK-NEXT: stxsiwx vs34, r5, r3 +; CHECK-NEXT: vpkudum v2, v2, v2 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_consecutive_i32: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stfiwx f0, 0, r5 -; CHECK-BE-NEXT: stfiwx f1, r5, r3 +; CHECK-BE-NEXT: xxswapd vs35, vs34 +; CHECK-BE-NEXT: vmrghw v2, v2, v3 +; CHECK-BE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_consecutive_i32: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 +; CHECK-P9-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_consecutive_i32: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: xxswapd vs35, vs34 +; CHECK-P9-BE-NEXT: vmrghw v2, v2, v3 +; 
CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 0 @@ -525,36 +519,34 @@ define dso_local void @test_consecutive_float(<4 x float> %a, float* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_consecutive_float: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 3 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, 0, r5 -; CHECK-NEXT: stfiwx f1, r5, r3 +; CHECK-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI15_0@toc@l +; CHECK-NEXT: lvx v3, 0, r3 +; CHECK-NEXT: vperm v2, v2, v2, v3 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_consecutive_float: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-BE-NEXT: vpkudum v2, v2, v2 +; CHECK-BE-NEXT: stxsdx vs34, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_consecutive_float: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-NEXT: stfiwx f0, 0, r5 -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l +; CHECK-P9-NEXT: lxvx vs35, 0, r3 +; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_consecutive_float: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsiwx vs34, 0, r5 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: vpkudum v2, v2, v2 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x float> %a, i32 1 @@ -569,28 +561,24 @@ ; CHECK-LABEL: 
test_stores_exceed_vec_size: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-NEXT: li r4, 20 ; CHECK-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-NEXT: lvx v3, 0, r3 -; CHECK-NEXT: li r3, 16 ; CHECK-NEXT: vperm v3, v2, v2, v3 +; CHECK-NEXT: vsldoi v2, v2, v2, 12 ; CHECK-NEXT: xxswapd vs0, vs35 +; CHECK-NEXT: xxswapd vs1, vs34 ; CHECK-NEXT: stxvd2x vs0, 0, r5 -; CHECK-NEXT: stfiwx f1, r5, r3 -; CHECK-NEXT: stxsiwx vs34, r5, r4 +; CHECK-NEXT: stfd f1, 16(r5) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: xxspltw vs0, vs34, 0 -; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1 +; CHECK-BE-NEXT: vsldoi v3, v2, v2, 4 ; CHECK-BE-NEXT: li r3, 16 -; CHECK-BE-NEXT: li r4, 20 -; CHECK-BE-NEXT: stxsiwx vs34, r5, r3 ; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 ; CHECK-BE-NEXT: stxvw4x vs0, 0, r5 -; CHECK-BE-NEXT: stfiwx f1, r5, r4 +; CHECK-BE-NEXT: stxsdx vs35, r5, r3 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_stores_exceed_vec_size: @@ -598,25 +586,20 @@ ; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha ; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-NEXT: lxvx vs35, 0, r3 -; CHECK-P9-NEXT: li r3, 16 ; CHECK-P9-NEXT: vperm v3, v2, v2, v3 -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 +; CHECK-P9-NEXT: vsldoi v2, v2, v2, 12 +; CHECK-P9-NEXT: xxswapd vs0, vs34 ; CHECK-P9-NEXT: stxv vs35, 0(r5) -; CHECK-P9-NEXT: stfiwx f0, r5, r3 -; CHECK-P9-NEXT: li r3, 20 -; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 +; CHECK-P9-NEXT: stfd f0, 16(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0 ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 -; CHECK-P9-BE-NEXT: li r3, 16 +; CHECK-P9-BE-NEXT: vsldoi v2, v2, v2, 4 ; CHECK-P9-BE-NEXT: stxv vs0, 0(r5) -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 
20 -; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: stxsd v2, 16(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 2 @@ -640,71 +623,55 @@ define void @test_5_consecutive_stores_of_bytes(<16 x i8> %a, i8* nocapture %b) local_unnamed_addr #0 { ; CHECK-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis r3, r2, .LCPI17_0@toc@ha ; CHECK-NEXT: xxswapd vs0, vs34 -; CHECK-NEXT: mfvsrd r3, vs34 -; CHECK-NEXT: rldicl r6, r3, 32, 56 -; CHECK-NEXT: rldicl r3, r3, 56, 56 -; CHECK-NEXT: mfvsrd r4, f0 -; CHECK-NEXT: stb r6, 1(r5) -; CHECK-NEXT: stb r3, 2(r5) -; CHECK-NEXT: rldicl r6, r4, 32, 56 -; CHECK-NEXT: rldicl r3, r4, 8, 56 -; CHECK-NEXT: rldicl r4, r4, 16, 56 -; CHECK-NEXT: stb r6, 0(r5) -; CHECK-NEXT: stb r3, 3(r5) -; CHECK-NEXT: stb r4, 4(r5) +; CHECK-NEXT: addi r3, r3, .LCPI17_0@toc@l +; CHECK-NEXT: lvx v3, 0, r3 +; CHECK-NEXT: mfvsrd r3, f0 +; CHECK-NEXT: vperm v3, v2, v2, v3 +; CHECK-NEXT: rldicl r3, r3, 16, 56 +; CHECK-NEXT: stb r3, 4(r5) +; CHECK-NEXT: xxsldwi vs1, vs35, vs35, 2 +; CHECK-NEXT: stfiwx f1, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxswapd vs0, vs34 +; CHECK-BE-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI17_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 ; CHECK-BE-NEXT: mfvsrd r3, vs34 -; CHECK-BE-NEXT: rldicl r6, r3, 40, 56 -; CHECK-BE-NEXT: mfvsrd r4, f0 -; CHECK-BE-NEXT: stb r6, 0(r5) -; CHECK-BE-NEXT: rldicl r6, r4, 40, 56 -; CHECK-BE-NEXT: rldicl r4, r4, 16, 56 -; CHECK-BE-NEXT: stb r6, 1(r5) -; CHECK-BE-NEXT: clrldi r6, r3, 56 ; CHECK-BE-NEXT: rldicl r3, r3, 56, 56 -; CHECK-BE-NEXT: stb r4, 2(r5) -; CHECK-BE-NEXT: stb r6, 3(r5) +; CHECK-BE-NEXT: vperm v3, v2, v2, v3 ; CHECK-BE-NEXT: stb r3, 4(r5) +; CHECK-BE-NEXT: xxsldwi vs0, vs35, vs35, 3 +; CHECK-BE-NEXT: stfiwx f0, 0, r5 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_5_consecutive_stores_of_bytes: ; 
CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4 -; CHECK-P9-NEXT: stxsibx vs35, 0, r5 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 12 -; CHECK-P9-NEXT: li r3, 1 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 15 -; CHECK-P9-NEXT: li r3, 2 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 1 -; CHECK-P9-NEXT: li r3, 3 -; CHECK-P9-NEXT: vsldoi v2, v2, v2, 2 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 +; CHECK-P9-NEXT: vsldoi v3, v2, v2, 2 ; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stxsibx vs34, r5, r3 +; CHECK-P9-NEXT: stxsibx vs35, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI17_0@toc@l +; CHECK-P9-NEXT: lxvx vs35, 0, r3 +; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 +; CHECK-P9-NEXT: stfiwx f0, 0, r5 ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13 -; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 5 -; CHECK-P9-BE-NEXT: li r3, 1 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 2 -; CHECK-P9-BE-NEXT: li r3, 2 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 3 -; CHECK-P9-BE-NEXT: stxsibx vs34, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v2, v2, v2, 15 +; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 15 ; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsibx vs34, r5, r3 +; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI17_0@toc@l +; CHECK-P9-BE-NEXT: lxvx vs35, 0, r3 +; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 +; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5 ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <16 x i8> %a, i32 4 @@ -727,151 +694,63 @@ define void @test_13_consecutive_stores_of_bytes(<16 x i8> %a, i8* nocapture %b) 
local_unnamed_addr #0 { ; CHECK-LABEL: test_13_consecutive_stores_of_bytes: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: addis r3, r2, .LCPI18_0@toc@ha +; CHECK-NEXT: li r4, 8 +; CHECK-NEXT: addi r3, r3, .LCPI18_0@toc@l +; CHECK-NEXT: lvx v3, 0, r3 ; CHECK-NEXT: mfvsrd r3, vs34 -; CHECK-NEXT: rldicl r4, r3, 32, 56 -; CHECK-NEXT: rldicl r6, r3, 56, 56 -; CHECK-NEXT: stb r4, 1(r5) -; CHECK-NEXT: rldicl r4, r3, 40, 56 -; CHECK-NEXT: mfvsrd r7, f0 -; CHECK-NEXT: stb r6, 2(r5) -; CHECK-NEXT: rldicl r6, r3, 24, 56 -; CHECK-NEXT: stb r4, 6(r5) -; CHECK-NEXT: rldicl r4, r3, 8, 56 -; CHECK-NEXT: stb r6, 7(r5) ; CHECK-NEXT: rldicl r3, r3, 16, 56 -; CHECK-NEXT: stb r4, 9(r5) -; CHECK-NEXT: rldicl r4, r7, 32, 56 -; CHECK-NEXT: rldicl r6, r7, 8, 56 -; CHECK-NEXT: stb r4, 0(r5) -; CHECK-NEXT: rldicl r4, r7, 16, 56 -; CHECK-NEXT: stb r6, 3(r5) -; CHECK-NEXT: clrldi r6, r7, 56 -; CHECK-NEXT: stb r4, 4(r5) -; CHECK-NEXT: rldicl r4, r7, 48, 56 -; CHECK-NEXT: stb r6, 5(r5) -; CHECK-NEXT: rldicl r6, r7, 56, 56 -; CHECK-NEXT: stb r4, 8(r5) -; CHECK-NEXT: rldicl r4, r7, 24, 56 -; CHECK-NEXT: stb r6, 10(r5) -; CHECK-NEXT: stb r4, 11(r5) +; CHECK-NEXT: vperm v3, v2, v2, v3 +; CHECK-NEXT: xxswapd vs0, vs35 +; CHECK-NEXT: stxsiwx vs35, r5, r4 ; CHECK-NEXT: stb r3, 12(r5) +; CHECK-NEXT: stfdx f0, 0, r5 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_13_consecutive_stores_of_bytes: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: mfvsrd r3, vs34 +; CHECK-BE-NEXT: addis r3, r2, .LCPI18_0@toc@ha ; CHECK-BE-NEXT: xxswapd vs0, vs34 -; CHECK-BE-NEXT: rldicl r4, r3, 40, 56 -; CHECK-BE-NEXT: clrldi r6, r3, 56 -; CHECK-BE-NEXT: stb r4, 0(r5) -; CHECK-BE-NEXT: rldicl r4, r3, 56, 56 -; CHECK-BE-NEXT: mfvsrd r7, f0 -; CHECK-BE-NEXT: stb r6, 3(r5) -; CHECK-BE-NEXT: rldicl r6, r3, 8, 56 -; CHECK-BE-NEXT: stb r4, 4(r5) -; CHECK-BE-NEXT: rldicl r4, r3, 24, 56 -; CHECK-BE-NEXT: stb r6, 5(r5) -; CHECK-BE-NEXT: rldicl r6, r3, 16, 56 -; CHECK-BE-NEXT: stb r4, 8(r5) -; CHECK-BE-NEXT: 
rldicl r4, r7, 40, 56 -; CHECK-BE-NEXT: stb r6, 10(r5) -; CHECK-BE-NEXT: rldicl r6, r7, 16, 56 -; CHECK-BE-NEXT: stb r4, 1(r5) -; CHECK-BE-NEXT: rldicl r4, r7, 32, 56 -; CHECK-BE-NEXT: stb r6, 2(r5) -; CHECK-BE-NEXT: rldicl r6, r7, 48, 56 -; CHECK-BE-NEXT: stb r4, 6(r5) -; CHECK-BE-NEXT: clrldi r4, r7, 56 -; CHECK-BE-NEXT: stb r6, 7(r5) -; CHECK-BE-NEXT: rldicl r3, r3, 48, 56 -; CHECK-BE-NEXT: rldicl r6, r7, 56, 56 -; CHECK-BE-NEXT: stb r4, 9(r5) -; CHECK-BE-NEXT: stb r3, 11(r5) -; CHECK-BE-NEXT: stb r6, 12(r5) +; CHECK-BE-NEXT: li r4, 8 +; CHECK-BE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; CHECK-BE-NEXT: lxvw4x vs35, 0, r3 +; CHECK-BE-NEXT: mfvsrd r3, f0 +; CHECK-BE-NEXT: vperm v3, v2, v2, v3 +; CHECK-BE-NEXT: rldicl r3, r3, 56, 56 +; CHECK-BE-NEXT: stb r3, 12(r5) +; CHECK-BE-NEXT: xxsldwi vs1, vs35, vs35, 1 +; CHECK-BE-NEXT: stxsdx vs35, 0, r5 +; CHECK-BE-NEXT: stfiwx f1, r5, r4 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_13_consecutive_stores_of_bytes: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4 -; CHECK-P9-NEXT: stxsibx vs35, 0, r5 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 12 -; CHECK-P9-NEXT: li r3, 1 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 15 -; CHECK-P9-NEXT: li r3, 2 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 1 -; CHECK-P9-NEXT: li r3, 3 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 8 -; CHECK-P9-NEXT: li r3, 5 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 13 -; CHECK-P9-NEXT: li r3, 6 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 11 -; CHECK-P9-NEXT: li r3, 7 +; CHECK-P9-NEXT: vsldoi v3, v2, v2, 10 +; CHECK-P9-NEXT: li r3, 12 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 6 +; CHECK-P9-NEXT: addis r3, r2, .LCPI18_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, 
.LCPI18_0@toc@l +; CHECK-P9-NEXT: lxvx vs35, 0, r3 ; CHECK-P9-NEXT: li r3, 8 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 9 -; CHECK-P9-NEXT: li r3, 9 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 7 -; CHECK-P9-NEXT: li r3, 10 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: vsldoi v3, v2, v2, 3 -; CHECK-P9-NEXT: li r3, 11 -; CHECK-P9-NEXT: vsldoi v2, v2, v2, 10 -; CHECK-P9-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-NEXT: li r3, 12 -; CHECK-P9-NEXT: stxsibx vs34, r5, r3 +; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stxsiwx vs34, r5, r3 +; CHECK-P9-NEXT: stfd f0, 0(r5) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_13_consecutive_stores_of_bytes: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13 -; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 5 -; CHECK-P9-BE-NEXT: li r3, 1 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 2 -; CHECK-P9-BE-NEXT: li r3, 2 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 15 -; CHECK-P9-BE-NEXT: stxsibx vs34, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 9 -; CHECK-P9-BE-NEXT: li r3, 5 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 4 -; CHECK-P9-BE-NEXT: li r3, 6 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 6 -; CHECK-P9-BE-NEXT: li r3, 7 +; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 7 +; CHECK-P9-BE-NEXT: li r3, 12 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 11 +; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI18_0@toc@ha +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; CHECK-P9-BE-NEXT: lxvx vs35, 0, r3 ; CHECK-P9-BE-NEXT: li r3, 8 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, 
v2, v2, 8 -; CHECK-P9-BE-NEXT: li r3, 9 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 10 -; CHECK-P9-BE-NEXT: li r3, 10 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 14 -; CHECK-P9-BE-NEXT: li r3, 11 -; CHECK-P9-BE-NEXT: vsldoi v2, v2, v2, 7 -; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 -; CHECK-P9-BE-NEXT: li r3, 12 -; CHECK-P9-BE-NEXT: stxsibx vs34, r5, r3 +; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 +; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r5) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <16 x i8> %a, i32 4 @@ -918,36 +797,36 @@ define void @test_elements_from_two_vec(<4 x i32> %a, <4 x i32> %b, i32* nocapture %c) local_unnamed_addr #0 { ; CHECK-LABEL: test_elements_from_two_vec: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: stfiwx f0, r7, r3 -; CHECK-NEXT: stfiwx f1, 0, r7 +; CHECK-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI19_0@toc@l +; CHECK-NEXT: lvx v4, 0, r3 +; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: xxswapd vs0, vs34 +; CHECK-NEXT: stfdx f0, 0, r7 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_elements_from_two_vec: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: stfiwx f0, r7, r3 -; CHECK-BE-NEXT: stxsiwx vs35, 0, r7 +; CHECK-BE-NEXT: vmrghw v3, v3, v3 +; CHECK-BE-NEXT: xxsldwi vs0, vs35, vs34, 3 +; CHECK-BE-NEXT: stfdx f0, 0, r7 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: test_elements_from_two_vec: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, r7, r3 -; CHECK-P9-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-NEXT: stfiwx f0, 0, r7 +; CHECK-P9-NEXT: addis r3, r2, .LCPI19_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, 
.LCPI19_0@toc@l +; CHECK-P9-NEXT: lxvx vs36, 0, r3 +; CHECK-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r7) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_elements_from_two_vec: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 -; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7 +; CHECK-P9-BE-NEXT: vmrghw v3, v3, v3 +; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs34, 3 +; CHECK-P9-BE-NEXT: stfd f0, 0(r7) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 0 @@ -961,46 +840,46 @@ define dso_local void @test_elements_from_three_vec(<4 x float> %a, <4 x float> %b, <4 x float> %c, float* nocapture %d) local_unnamed_addr #0 { ; CHECK-LABEL: test_elements_from_three_vec: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-NEXT: xxsldwi vs1, vs36, vs36, 1 -; CHECK-NEXT: li r3, 4 -; CHECK-NEXT: li r4, 8 -; CHECK-NEXT: stxsiwx vs35, r9, r3 -; CHECK-NEXT: stfiwx f0, 0, r9 -; CHECK-NEXT: stfiwx f1, r9, r4 +; CHECK-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-NEXT: xxsldwi vs0, vs36, vs36, 1 +; CHECK-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-NEXT: lvx v5, 0, r3 +; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: stfiwx f0, r9, r3 +; CHECK-NEXT: vperm v2, v3, v2, v5 +; CHECK-NEXT: xxswapd vs1, vs34 +; CHECK-NEXT: stfdx f1, 0, r9 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test_elements_from_three_vec: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-BE-NEXT: xxsldwi vs1, vs35, vs35, 1 -; CHECK-BE-NEXT: li r3, 4 -; CHECK-BE-NEXT: li r4, 8 -; CHECK-BE-NEXT: stfiwx f1, r9, r3 -; CHECK-BE-NEXT: stfiwx f0, 0, r9 -; CHECK-BE-NEXT: stxsiwx vs36, r9, r4 +; CHECK-BE-NEXT: xxsldwi vs34, vs34, vs34, 1 +; CHECK-BE-NEXT: li r3, 8 +; CHECK-BE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-BE-NEXT: vmrglw v2, v2, v3 +; CHECK-BE-NEXT: stxsdx vs34, 0, r9 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: 
test_elements_from_three_vec: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: li r3, 4 -; CHECK-P9-NEXT: stfiwx f0, 0, r9 ; CHECK-P9-NEXT: xxsldwi vs0, vs36, vs36, 1 -; CHECK-P9-NEXT: stxsiwx vs35, r9, r3 ; CHECK-P9-NEXT: li r3, 8 ; CHECK-P9-NEXT: stfiwx f0, r9, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-P9-NEXT: lxvx vs36, 0, r3 +; CHECK-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-P9-NEXT: xxswapd vs0, vs34 +; CHECK-P9-NEXT: stfd f0, 0(r9) ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: test_elements_from_three_vec: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 -; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3 +; CHECK-P9-BE-NEXT: xxsldwi vs34, vs34, vs34, 1 +; CHECK-P9-BE-NEXT: vmrglw v2, v2, v3 ; CHECK-P9-BE-NEXT: li r3, 8 ; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3 +; CHECK-P9-BE-NEXT: stxsd v2, 0(r9) ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x float> %a, i32 3