diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19082,19 +19082,54 @@
     for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
       int Left = 2 * In;
       int Right = 2 * In + 1;
+      // Here we create a vector_shuffle between Left and Right. Try to
+      // optimize the case where Left or Right (or both) is a vector_shuffle
+      // on one operand only (i.e. its second operand is undef).
+      // For example, given:
+      //   t0: v8i32 = vector_shuffle<0,5,u,u,u,u,u,u> t2, undef
+      //   t1: v8i32 = vector_shuffle t3, t4
+      // create the following vector_shuffle:
+      //   v8i32 = vector_shuffle<0,5,10,11,u,u,u,u> t2, t1
+      bool IsOneVecShuffleLeft =
+          Shuffles[Left].getOpcode() == ISD::VECTOR_SHUFFLE &&
+          Shuffles[Left].getOperand(1).isUndef();
+      bool IsOneVecShuffleRight =
+          Shuffles[Right].getOpcode() == ISD::VECTOR_SHUFFLE &&
+          Shuffles[Right].getOperand(1).isUndef();
+      SmallVector<int, 8> Mask(NumElems, -1);
+      SDValue ToShuffleLeft = Shuffles[Left];
+      SDValue ToShuffleRight = Shuffles[Right];
+
+      if (IsOneVecShuffleLeft)
+        ToShuffleLeft = ToShuffleLeft.getOperand(0);
+
+      if (IsOneVecShuffleRight)
+        ToShuffleRight = ToShuffleRight.getOperand(0);
+
       for (unsigned i = 0; i != NumElems; ++i) {
-        if (VectorMask[i] == Left) {
-          Mask[i] = i;
-          VectorMask[i] = In;
-        } else if (VectorMask[i] == Right) {
-          Mask[i] = i + NumElems;
-          VectorMask[i] = In;
+        bool ElementTakenFromLeft = VectorMask[i] == Left;
+        bool ElementTakenFromRight = VectorMask[i] == Right;
+        if (!ElementTakenFromLeft && !ElementTakenFromRight)
+          continue;
+        VectorMask[i] = In;
+
+        if (ElementTakenFromLeft) {
+          if (!IsOneVecShuffleLeft)
+            Mask[i] = i;
+          else
+            Mask[i] = cast<ShuffleVectorSDNode>(Shuffles[Left])->getMask()[i];
+        } else {
+          if (!IsOneVecShuffleRight)
+            Mask[i] = i + NumElems;
+          else
+            Mask[i] = cast<ShuffleVectorSDNode>(Shuffles[Right])->getMask()[i] +
+                      NumElems;
         }
       }
       Shuffles[In] =
-          DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
+          DAG.getVectorShuffle(VT, DL, ToShuffleLeft, ToShuffleRight, Mask);
     }
   }
   return Shuffles[0];
diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
--- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
+++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -174,15 +174,13 @@
 ; CHECKHARD-NEXT:    vmov r1, s0
 ; CHECKHARD-NEXT:    vmovx.f16 s12, s1
 ; CHECKHARD-NEXT:    vmov r0, s12
-; CHECKHARD-NEXT:    vext.16 d16, d4, d5, #2
+; CHECKHARD-NEXT:    vrev32.16 d16, d3
+; CHECKHARD-NEXT:    vext.16 d17, d4, d5, #2
 ; CHECKHARD-NEXT:    vmovx.f16 s12, s4
-; CHECKHARD-NEXT:    vdup.16 q11, d3[1]
-; CHECKHARD-NEXT:    vrev32.16 d17, d16
-; CHECKHARD-NEXT:    vext.16 d16, d16, d17, #3
-; CHECKHARD-NEXT:    vrev32.16 d17, d3
-; CHECKHARD-NEXT:    vext.16 d17, d17, d3, #1
-; CHECKHARD-NEXT:    vext.16 d16, d16, d17, #2
-; CHECKHARD-NEXT:    vext.16 d17, d16, d16, #2
+; CHECKHARD-NEXT:    vext.16 d16, d16, d3, #1
+; CHECKHARD-NEXT:    vext.16 d16, d17, d16, #2
+; CHECKHARD-NEXT:    vext.16 d16, d16, d17, #1
+; CHECKHARD-NEXT:    vext.16 d17, d16, d16, #1
 ; CHECKHARD-NEXT:    vmov.16 d16[0], r1
 ; CHECKHARD-NEXT:    vmov.16 d16[1], r0
 ; CHECKHARD-NEXT:    vmov r0, s3
@@ -194,37 +192,38 @@
 ; CHECKHARD-NEXT:    vmov.16 d16[3], r0
 ; CHECKHARD-NEXT:    vmov r0, s2
 ; CHECKHARD-NEXT:    vmov.16 d18[0], r1
-; CHECKHARD-NEXT:    vmov r1, s8
 ; CHECKHARD-NEXT:    vmov.16 d18[1], r0
 ; CHECKHARD-NEXT:    vmov r0, s12
+; CHECKHARD-NEXT:    vdup.16 q3, d3[1]
+; CHECKHARD-NEXT:    vmov r1, s12
 ; CHECKHARD-NEXT:    vmovx.f16 s12, s9
-; CHECKHARD-NEXT:    vmov.16 d20[1], r1
; CHECKHARD-NEXT: vmov.16 d18[2], r0 ; CHECKHARD-NEXT: vmov r0, s5 ; CHECKHARD-NEXT: vmov.16 d18[3], r0 +; CHECKHARD-NEXT: vmov r0, s8 +; CHECKHARD-NEXT: vmov.16 d19[0], r1 +; CHECKHARD-NEXT: vmov.16 d19[1], r0 ; CHECKHARD-NEXT: vmov r0, s12 -; CHECKHARD-NEXT: vmov.16 d20[2], r0 +; CHECKHARD-NEXT: vmov.16 d19[2], r0 ; CHECKHARD-NEXT: vmov r0, s11 -; CHECKHARD-NEXT: vmov.16 d20[3], r0 -; CHECKHARD-NEXT: vmov r0, s10 -; CHECKHARD-NEXT: vext.16 d20, d20, d22, #1 -; CHECKHARD-NEXT: vdup.16 q11, d3[2] -; CHECKHARD-NEXT: vext.16 d19, d20, d20, #3 +; CHECKHARD-NEXT: vmov.16 d19[3], r0 ; CHECKHARD-NEXT: vadd.f16 q8, q8, q9 ; CHECKHARD-NEXT: vext.16 d18, d0, d1, #2 ; CHECKHARD-NEXT: vmovx.f16 s0, s8 -; CHECKHARD-NEXT: vmov r1, s0 -; CHECKHARD-NEXT: vmovx.f16 s0, s11 +; CHECKHARD-NEXT: vmov r0, s0 +; CHECKHARD-NEXT: vdup.16 q0, d3[2] ; CHECKHARD-NEXT: vext.16 d19, d18, d2, #3 +; CHECKHARD-NEXT: vmov r1, s0 ; CHECKHARD-NEXT: vext.16 d18, d2, d18, #1 +; CHECKHARD-NEXT: vmovx.f16 s0, s11 ; CHECKHARD-NEXT: vext.16 d18, d18, d19, #2 ; CHECKHARD-NEXT: vext.16 d18, d18, d18, #1 -; CHECKHARD-NEXT: vmov.16 d20[1], r1 -; CHECKHARD-NEXT: vmov.16 d20[2], r0 +; CHECKHARD-NEXT: vmov.16 d19[0], r1 +; CHECKHARD-NEXT: vmov.16 d19[1], r0 +; CHECKHARD-NEXT: vmov r0, s10 +; CHECKHARD-NEXT: vmov.16 d19[2], r0 ; CHECKHARD-NEXT: vmov r0, s0 -; CHECKHARD-NEXT: vmov.16 d20[3], r0 -; CHECKHARD-NEXT: vext.16 d20, d20, d22, #1 -; CHECKHARD-NEXT: vext.16 d19, d20, d20, #3 +; CHECKHARD-NEXT: vmov.16 d19[3], r0 ; CHECKHARD-NEXT: vadd.f16 q0, q8, q9 ; CHECKHARD-NEXT: bx lr ; @@ -233,15 +232,13 @@ ; CHECKSOFT-NEXT: vmov r1, s0 ; CHECKSOFT-NEXT: vmovx.f16 s12, s1 ; CHECKSOFT-NEXT: vmov r0, s12 -; CHECKSOFT-NEXT: vext.16 d16, d4, d5, #2 +; CHECKSOFT-NEXT: vrev32.16 d16, d3 +; CHECKSOFT-NEXT: vext.16 d17, d4, d5, #2 ; CHECKSOFT-NEXT: vmovx.f16 s12, s4 -; CHECKSOFT-NEXT: vdup.16 q11, d3[1] -; CHECKSOFT-NEXT: vrev32.16 d17, d16 -; CHECKSOFT-NEXT: vext.16 d16, d16, d17, #3 -; CHECKSOFT-NEXT: vrev32.16 d17, d3 -; CHECKSOFT-NEXT: vext.16 d17, d17, d3, #1 -; CHECKSOFT-NEXT: vext.16 d16, d16, d17, #2 -; CHECKSOFT-NEXT: vext.16 d17, d16, d16, #2 +; CHECKSOFT-NEXT: vext.16 d16, d16, d3, #1 +; CHECKSOFT-NEXT: vext.16 d16, d17, d16, #2 +; CHECKSOFT-NEXT: vext.16 d16, d16, d17, #1 +; CHECKSOFT-NEXT: vext.16 d17, d16, d16, #1 ; CHECKSOFT-NEXT: vmov.16 d16[0], r1 ; CHECKSOFT-NEXT: vmov.16 d16[1], r0 ; CHECKSOFT-NEXT: vmov r0, s3 @@ -253,37 +250,38 @@ ; CHECKSOFT-NEXT: vmov.16 d16[3], r0 ; CHECKSOFT-NEXT: vmov r0, s2 ; CHECKSOFT-NEXT: vmov.16 d18[0], r1 -; CHECKSOFT-NEXT: vmov r1, s8 ; CHECKSOFT-NEXT: vmov.16 d18[1], r0 ; CHECKSOFT-NEXT: vmov r0, s12 +; CHECKSOFT-NEXT: vdup.16 q3, d3[1] +; CHECKSOFT-NEXT: vmov r1, s12 ; CHECKSOFT-NEXT: vmovx.f16 s12, s9 -; CHECKSOFT-NEXT: vmov.16 d20[1], r1 ; CHECKSOFT-NEXT: vmov.16 d18[2], r0 ; CHECKSOFT-NEXT: vmov r0, s5 ; CHECKSOFT-NEXT: vmov.16 d18[3], r0 +; CHECKSOFT-NEXT: vmov r0, s8 +; CHECKSOFT-NEXT: vmov.16 d19[0], r1 +; CHECKSOFT-NEXT: vmov.16 d19[1], r0 ; CHECKSOFT-NEXT: vmov r0, s12 -; CHECKSOFT-NEXT: vmov.16 d20[2], r0 +; CHECKSOFT-NEXT: vmov.16 d19[2], r0 ; CHECKSOFT-NEXT: vmov r0, s11 -; CHECKSOFT-NEXT: vmov.16 d20[3], r0 -; CHECKSOFT-NEXT: vmov r0, s10 -; CHECKSOFT-NEXT: vext.16 d20, d20, d22, #1 -; CHECKSOFT-NEXT: vdup.16 q11, d3[2] -; CHECKSOFT-NEXT: vext.16 d19, d20, d20, #3 +; CHECKSOFT-NEXT: vmov.16 d19[3], r0 ; CHECKSOFT-NEXT: vadd.f16 q8, q8, q9 ; CHECKSOFT-NEXT: vext.16 d18, d0, d1, #2 ; CHECKSOFT-NEXT: vmovx.f16 s0, s8 -; CHECKSOFT-NEXT: vmov r1, s0 -; CHECKSOFT-NEXT: vmovx.f16 s0, s11 +; 
CHECKSOFT-NEXT: vmov r0, s0 +; CHECKSOFT-NEXT: vdup.16 q0, d3[2] ; CHECKSOFT-NEXT: vext.16 d19, d18, d2, #3 +; CHECKSOFT-NEXT: vmov r1, s0 ; CHECKSOFT-NEXT: vext.16 d18, d2, d18, #1 +; CHECKSOFT-NEXT: vmovx.f16 s0, s11 ; CHECKSOFT-NEXT: vext.16 d18, d18, d19, #2 ; CHECKSOFT-NEXT: vext.16 d18, d18, d18, #1 -; CHECKSOFT-NEXT: vmov.16 d20[1], r1 -; CHECKSOFT-NEXT: vmov.16 d20[2], r0 +; CHECKSOFT-NEXT: vmov.16 d19[0], r1 +; CHECKSOFT-NEXT: vmov.16 d19[1], r0 +; CHECKSOFT-NEXT: vmov r0, s10 +; CHECKSOFT-NEXT: vmov.16 d19[2], r0 ; CHECKSOFT-NEXT: vmov r0, s0 -; CHECKSOFT-NEXT: vmov.16 d20[3], r0 -; CHECKSOFT-NEXT: vext.16 d20, d20, d22, #1 -; CHECKSOFT-NEXT: vext.16 d19, d20, d20, #3 +; CHECKSOFT-NEXT: vmov.16 d19[3], r0 ; CHECKSOFT-NEXT: vadd.f16 q0, q8, q9 ; CHECKSOFT-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/PowerPC/pr27078.ll b/llvm/test/CodeGen/PowerPC/pr27078.ll --- a/llvm/test/CodeGen/PowerPC/pr27078.ll +++ b/llvm/test/CodeGen/PowerPC/pr27078.ll @@ -9,10 +9,9 @@ %6 = shufflevector <12 x float> %5, <12 x float> undef, <4 x i32> ret <4 x float> %6 -; CHECK: xxsldwi -; CHECK-DAG: vmrghw +; CHECK: lxvw4x ; CHECK-DAG: vmrglw -; CHECK-NEXT: xxsldwi +; CHECK-DAG: vmrghw ; CHECK-NEXT: xxsldwi ; CHECK-NEXT: xxsldwi ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/vec-itofp.ll b/llvm/test/CodeGen/PowerPC/vec-itofp.ll --- a/llvm/test/CodeGen/PowerPC/vec-itofp.ll +++ b/llvm/test/CodeGen/PowerPC/vec-itofp.ll @@ -312,13 +312,13 @@ ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_2@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_2@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs1, v3 @@ -400,7 +400,7 @@ ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI4_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI4_1@toc@l -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vperm v3, v2, v3, v4 ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll @@ -378,7 +378,7 @@ ; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: vmrghh v2, v2, v2 ; CHECK-BE-NEXT: vextsh2w v3, v3 ; CHECK-BE-NEXT: vextsh2w v2, v2 @@ -455,8 +455,8 @@ ; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: vperm v0, v5, v3, v4 -; CHECK-BE-NEXT: vperm v4, v5, v2, v4 +; CHECK-BE-NEXT: vperm v0, v3, v5, v4 +; CHECK-BE-NEXT: vperm v4, v2, v5, v4 ; CHECK-BE-NEXT: vmrghh v3, v3, v3 ; CHECK-BE-NEXT: vmrghh v2, v2, v2 ; CHECK-BE-NEXT: vextsh2w v0, v0 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll @@ 
-464,7 +464,7 @@ ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vperm v3, v2, v3, v4 ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 @@ -569,13 +569,13 @@ ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_2@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs1, v3 @@ -740,14 +740,14 @@ ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r5, r5, .LCPI7_1@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r5 -; CHECK-BE-NEXT: vperm v0, v5, v4, v2 -; CHECK-BE-NEXT: vperm v2, v5, v1, v2 +; CHECK-BE-NEXT: vperm v0, v4, v5, v2 +; CHECK-BE-NEXT: vperm v2, v1, v5, v2 ; CHECK-BE-NEXT: vextsh2d v2, v2 ; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs2, v2 -; CHECK-BE-NEXT: vperm v2, v5, v1, v3 +; CHECK-BE-NEXT: vperm v2, v1, v5, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v0 -; CHECK-BE-NEXT: vperm v0, v5, v4, v3 +; CHECK-BE-NEXT: vperm v0, v4, v5, v3 ; CHECK-BE-NEXT: vextsh2d v2, v2 ; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs3, v2 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll @@ -437,7 +437,7 @@ ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vperm v3, v2, v3, v4 ; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 @@ -536,13 +536,13 @@ ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs1, v3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll @@ -497,7 +497,7 @@ ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vperm v3, v2, v3, v4 ; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 @@ -605,13 +605,13 @@ ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, 
v4, v3 ; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_2@toc@l -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vperm v3, v2, v4, v3 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs1, v3 @@ -792,27 +792,27 @@ ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l -; CHECK-BE-NEXT: vperm v4, v3, v2, v4 +; CHECK-BE-NEXT: vperm v4, v2, v3, v4 ; CHECK-BE-NEXT: vextsb2d v4, v4 ; CHECK-BE-NEXT: xvcvsxddp vs0, v4 ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l -; CHECK-BE-NEXT: vperm v4, v3, v2, v4 +; CHECK-BE-NEXT: vperm v4, v2, v3, v4 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: vextsb2d v4, v4 ; CHECK-BE-NEXT: xvcvsxddp vs1, v4 ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l -; CHECK-BE-NEXT: vperm v4, v3, v2, v4 +; CHECK-BE-NEXT: vperm v4, v2, v3, v4 ; CHECK-BE-NEXT: stxv vs1, 48(r3) ; CHECK-BE-NEXT: vextsb2d v4, v4 ; CHECK-BE-NEXT: xvcvsxddp vs2, v4 ; CHECK-BE-NEXT: lxvx v4, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_4@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_4@toc@l -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vperm v3, v2, v3, v4 ; CHECK-BE-NEXT: stxv vs2, 80(r3) ; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs3, v3 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -135,19 +135,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q3 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s17, s3 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vadd.i32 q0, q3, q1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -361,59 +361,55 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle3step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r0, q0[5] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r0, q1[0] ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.f32 s14, s7 +; 
CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmovx.f16 s18, s6 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vins.f16 s17, s5 +; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmovnb.i32 q6, q4 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmovx.f16 s23, s9 -; CHECK-NEXT: vins.f16 s23, s11 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s8, s2 -; CHECK-NEXT: vmovx.f16 s9, s3 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmovnb.i32 q0, q2 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s23 -; CHECK-NEXT: vadd.i16 q0, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: vadd.i16 q0, q5, q4 +; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -759,8 +755,8 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) { ; CHECK-LABEL: shuffle3step_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.8 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] @@ -775,14 +771,14 @@ ; CHECK-NEXT: vmov.8 q3[5], r0 ; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.8 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q3[7], r0 ; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q3[8], r0 +; CHECK-NEXT: vmov.8 q4[8], r0 ; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q3[9], r0 +; CHECK-NEXT: vmov.8 q4[9], r0 ; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q3[10], r0 +; CHECK-NEXT: vmov.8 q4[10], r0 +; 
CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.8 q4[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.8 q4[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[8] @@ -791,11 +787,10 @@ ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov.8 q5[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q3[7], r0 ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmov.f32 s14, s18 ; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] @@ -810,14 +805,14 @@ ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vmov.8 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q4[8], r0 +; CHECK-NEXT: vmov.8 q5[8], r0 ; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q4[9], r0 +; CHECK-NEXT: vmov.8 q5[9], r0 ; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q4[10], r0 +; CHECK-NEXT: vmov.8 q5[10], r0 +; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] @@ -826,11 +821,10 @@ ; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vmov.8 q6[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.f32 s18, s22 ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 @@ -846,6 +840,14 @@ ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u8 r0, q1[4] ; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.u8 r0, q2[0] +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.u8 r0, q2[3] +; CHECK-NEXT: vmov.8 q0[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[6] ; CHECK-NEXT: vmov.8 q0[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[9] @@ -854,20 +856,12 @@ ; CHECK-NEXT: vmov.8 q0[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[15] ; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q1[7] ; CHECK-NEXT: vmov.8 q4[7], r0 -; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vadd.i8 q0, q3, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> @@ -1185,19 +1179,19 @@ ; CHECKFP: @ %bb.0: @ %entry ; CHECKFP-NEXT: .vsave {d8, d9} ; CHECKFP-NEXT: vpush {d8, d9} -; CHECKFP-NEXT: vmov.f32 s14, s8 -; CHECKFP-NEXT: vmov.f32 s15, s11 -; CHECKFP-NEXT: vmov.f32 s16, s1 -; CHECKFP-NEXT: vmov.f32 s12, s2 -; CHECKFP-NEXT: vmov.f32 s17, s4 -; CHECKFP-NEXT: vmov.f32 s1, s3 -; CHECKFP-NEXT: vmov.f32 s18, s7 -; CHECKFP-NEXT: vmov.f32 s2, s6 -; CHECKFP-NEXT: vmov.f32 s19, s10 -; CHECKFP-NEXT: vmov.f32 s3, s9 -; CHECKFP-NEXT: vmov.f32 s13, s5 -; CHECKFP-NEXT: vadd.f32 q0, q0, q4 -; CHECKFP-NEXT: vadd.f32 q0, q0, q3 +; CHECKFP-NEXT: vmov.f32 
s12, s1 +; CHECKFP-NEXT: vmov.f32 s16, s0 +; CHECKFP-NEXT: vmov.f32 s13, s4 +; CHECKFP-NEXT: vmov.f32 s17, s3 +; CHECKFP-NEXT: vmov.f32 s14, s7 +; CHECKFP-NEXT: vmov.f32 s18, s6 +; CHECKFP-NEXT: vmov.f32 s4, s2 +; CHECKFP-NEXT: vmov.f32 s6, s8 +; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vmov.f32 s19, s9 +; CHECKFP-NEXT: vadd.f32 q3, q4, q3 +; CHECKFP-NEXT: vmov.f32 s7, s11 +; CHECKFP-NEXT: vadd.f32 q0, q3, q1 ; CHECKFP-NEXT: vpop {d8, d9} ; CHECKFP-NEXT: bx lr entry: @@ -1399,45 +1393,42 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) { ; CHECKFP-LABEL: shuffle3step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECKFP-NEXT: .vsave {d8, d9, d10} +; CHECKFP-NEXT: vpush {d8, d9, d10} ; CHECKFP-NEXT: vmovx.f16 s16, s2 ; CHECKFP-NEXT: vmov.f32 s12, s1 ; CHECKFP-NEXT: vins.f16 s12, s16 ; CHECKFP-NEXT: vmovx.f16 s16, s5 ; CHECKFP-NEXT: vmov.f32 s13, s4 -; CHECKFP-NEXT: vmovx.f16 s20, s11 +; CHECKFP-NEXT: vmovx.f16 s20, s1 ; CHECKFP-NEXT: vins.f16 s13, s16 -; CHECKFP-NEXT: vmov.f32 s19, s10 -; CHECKFP-NEXT: vins.f16 s19, s20 +; CHECKFP-NEXT: vmovx.f16 s16, s8 ; CHECKFP-NEXT: vmov.f32 s14, s7 -; CHECKFP-NEXT: vmovx.f16 s20, s8 -; CHECKFP-NEXT: vmovx.f16 s24, s1 -; CHECKFP-NEXT: vins.f16 s14, s20 -; CHECKFP-NEXT: vmov.f32 s20, s0 -; CHECKFP-NEXT: vins.f16 s20, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s4 -; CHECKFP-NEXT: vmov.f32 s21, s3 -; CHECKFP-NEXT: vins.f16 s21, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s7 -; CHECKFP-NEXT: vmov.f32 s22, s6 -; CHECKFP-NEXT: vins.f16 s22, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s0 -; CHECKFP-NEXT: vins.f16 s24, s2 -; CHECKFP-NEXT: vmov.f32 s18, s8 -; CHECKFP-NEXT: vmovx.f16 s25, s3 -; CHECKFP-NEXT: vmovx.f16 s0, s10 -; CHECKFP-NEXT: vins.f16 s25, s5 -; CHECKFP-NEXT: vmov.f32 s15, s19 -; CHECKFP-NEXT: vmovx.f16 s27, s9 -; CHECKFP-NEXT: vins.f16 s9, s0 -; CHECKFP-NEXT: vins.f16 s27, s11 -; CHECKFP-NEXT: vmov.f32 s23, s9 -; CHECKFP-NEXT: vmovx.f16 s26, s6 -; CHECKFP-NEXT: vins.f16 s26, s8 -; CHECKFP-NEXT: vadd.f16 q0, q5, q6 +; CHECKFP-NEXT: vins.f16 s14, s16 +; CHECKFP-NEXT: vmovx.f16 s16, s11 +; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vins.f16 s15, s16 +; CHECKFP-NEXT: vmovx.f16 s18, s6 +; CHECKFP-NEXT: vins.f16 s18, s8 +; CHECKFP-NEXT: vmovx.f16 s19, s9 +; CHECKFP-NEXT: vins.f16 s19, s11 +; CHECKFP-NEXT: vmovx.f16 s16, s0 +; CHECKFP-NEXT: vins.f16 s16, s2 +; CHECKFP-NEXT: vmovx.f16 s17, s3 +; CHECKFP-NEXT: vins.f16 s0, s20 +; CHECKFP-NEXT: vmovx.f16 s20, s4 +; CHECKFP-NEXT: vmov.f32 s1, s3 +; CHECKFP-NEXT: vins.f16 s1, s20 +; CHECKFP-NEXT: vmovx.f16 s20, s7 +; CHECKFP-NEXT: vins.f16 s17, s5 +; CHECKFP-NEXT: vins.f16 s6, s20 +; CHECKFP-NEXT: vmov.f32 s2, s6 +; CHECKFP-NEXT: vmovx.f16 s4, s10 +; CHECKFP-NEXT: vins.f16 s9, s4 +; CHECKFP-NEXT: vmov.f32 s3, s9 +; CHECKFP-NEXT: vadd.f16 q0, q0, q4 ; CHECKFP-NEXT: vadd.f16 q0, q0, q3 -; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECKFP-NEXT: vpop {d8, d9, d10} ; CHECKFP-NEXT: bx lr entry: %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -36,22 +36,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: 
vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.i32 q0, q3, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -71,39 +71,39 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.i32 q1, q3, q5 -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vadd.i32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -123,73 +123,73 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, 
[r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.i32 q0, q2, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #176] +; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s26, s11 ; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.i32 q1, q3, q5 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #176] -; CHECK-NEXT: vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vldrw.u32 q5, [r0, #160] -; CHECK-NEXT: vmov.f64 d7, d4 -; CHECK-NEXT: vmov.f32 s15, s11 -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s25, s20 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vmov.f32 s26, s23 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s27, s10 -; CHECK-NEXT: vmov.f32 s19, s9 -; CHECK-NEXT: vmov.f32 s13, s21 -; CHECK-NEXT: vadd.i32 q2, q4, q6 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #128] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov.f32 s11, s19 +; CHECK-NEXT: vadd.i32 q5, q5, q6 +; CHECK-NEXT: vadd.i32 q2, q5, q2 ; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vldrw.u32 q6, [r0, #112] -; CHECK-NEXT: vmov.f64 d9, d6 +; CHECK-NEXT: vldrw.u32 q4, [r0, #128] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s24, s21 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f64 d14, d10 ; CHECK-NEXT: vstrw.32 q1, [r1] 
-; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vmov.f32 s28, s21 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s29, s24 -; CHECK-NEXT: vmov.f32 s21, s23 -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s31, s14 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s17, s25 -; CHECK-NEXT: vadd.i32 q3, q5, q7 -; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vmov.f32 s29, s23 +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmov.f32 s27, s18 +; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vadd.i32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -288,8 +288,8 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] @@ -301,50 +301,46 @@ ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[3] ; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.16 q4[5], r0 ; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.16 q4[7], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmovx.f16 s18, s10 +; CHECK-NEXT: vins.f16 s18, s4 ; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.f32 s22, s4 -; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmovx.f16 s19, s5 +; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vins.f16 s19, s7 +; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmovnb.i32 q6, q0 -; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmovx.f16 s16, s12 +; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vins.f16 s16, s14 +; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.f32 s3, s23 -; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmovx.f16 s17, s15 +; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vins.f16 s17, s9 +; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vins.f16 s20, s14 -; CHECK-NEXT: vmovx.f16 s21, s15 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s10, s4 -; CHECK-NEXT: vmovx.f16 s11, s5 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vadd.i16 q1, q4, 
q5 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vadd.i16 q1, q5, q4 ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i16>, <24 x i16>* %src, align 4 @@ -360,122 +356,109 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vld3_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov.u16 r2, q4[2] +; CHECK-NEXT: vmovx.f16 s22, s14 +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.u16 r2, q4[5] +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[0] +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s23, s9 ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov.u16 r2, q4[0] +; CHECK-NEXT: vmov.f32 s6, s26 +; CHECK-NEXT: vmovx.f16 s20, s16 +; CHECK-NEXT: vmov.f32 s7, s27 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vins.f16 s20, s18 +; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: vmov.16 q6[2], r2 +; CHECK-NEXT: vmov.u16 r2, q3[1] +; CHECK-NEXT: vmov.16 q6[3], r2 ; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov.16 q6[4], r2 ; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmovx.f16 s21, s19 +; CHECK-NEXT: vmov.16 q4[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vins.f16 s21, s13 +; CHECK-NEXT: vmov.16 q4[7], r2 ; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s12 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmovnb.i32 q6, q4 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vins.f16 s22, s12 -; CHECK-NEXT: vmovx.f16 s23, s13 -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: 
vmovx.f16 s12, s8 -; CHECK-NEXT: vins.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s13, s11 +; CHECK-NEXT: vmov.16 q6[5], r2 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vins.f16 s13, s5 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmovnb.i32 q1, q3 -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.f32 s27, s19 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vadd.i16 q3, q6, q5 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vadd.i16 q1, q3, q1 +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.u16 r0, q4[1] +; CHECK-NEXT: vmov.16 q3[1], r2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.16 q3[2], r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s26, s20 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmovnb.i32 q7, q1 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.f32 s6, s30 -; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vmov.16 q5[6], r0 +; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vmovx.f16 s22, s10 +; CHECK-NEXT: vins.f16 s22, s16 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmovx.f16 s23, s17 +; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vins.f16 s23, s19 +; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.f32 s7, s27 -; CHECK-NEXT: vins.f16 s26, s20 -; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmovx.f16 s27, s21 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[2] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmovx.f16 s21, s3 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vins.f16 s27, s23 -; CHECK-NEXT: vmovx.f16 s20, s12 +; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: vins.f16 s21, s9 ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vins.f16 s20, s14 ; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmovx.f16 s21, s15 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vmovnb.i32 q0, q5 -; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vadd.i16 q0, q4, q5 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: 
vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vadd.i16 q0, q6, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q3 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x i16>, <48 x i16>* %src, align 4 @@ -638,8 +621,8 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld3_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] @@ -657,27 +640,26 @@ ; CHECK-NEXT: vmov.8 q3[5], r2 ; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: vmov.8 q3[6], r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov.8 q3[7], r2 ; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q3[8], r2 +; CHECK-NEXT: vmov.8 q4[8], r2 ; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov.8 q4[9], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.8 q4[10], r2 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.8 q4[11], r0 +; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.8 q4[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q3[9], r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: vmov.8 q4[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q3[10], r2 ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov.8 q5[11], r0 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.8 q3[7], r0 ; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmov.f32 s14, s18 ; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[3] @@ -692,14 +674,14 @@ ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.8 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.8 q4[8], r0 +; CHECK-NEXT: vmov.8 q5[8], r0 ; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.8 q4[9], r0 +; CHECK-NEXT: vmov.8 q5[9], r0 ; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.8 q4[10], r0 +; CHECK-NEXT: vmov.8 q5[10], r0 +; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] @@ -708,11 +690,10 @@ ; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vmov.8 q6[11], r0 +; CHECK-NEXT: vmov.u8 r0, q0[5] +; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.f32 s18, s22 ; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 @@ -728,6 +709,14 @@ ; CHECK-NEXT: vmov.8 q4[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.8 q1[8], r0 +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.8 q1[9], r0 +; CHECK-NEXT: vmov.u8 r0, q2[0] +; CHECK-NEXT: vmov.8 q1[10], r0 +; CHECK-NEXT: vmov.u8 r0, q2[3] +; 
CHECK-NEXT: vmov.8 q1[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[6] ; CHECK-NEXT: vmov.8 q1[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[9] @@ -736,21 +725,13 @@ ; CHECK-NEXT: vmov.8 q1[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[15] ; CHECK-NEXT: vmov.8 q1[15], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: vmov.8 q4[7], r0 -; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov.f32 s18, s6 ; CHECK-NEXT: vmov.f32 s19, s7 ; CHECK-NEXT: vadd.i8 q0, q3, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x i8>, <48 x i8>* %src, align 4 @@ -950,22 +931,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.f32 q0, q2, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -985,39 +966,39 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.f32 q0, q2, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: 
vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.f32 q1, q3, q5 -; CHECK-NEXT: vadd.f32 q1, q1, q2 +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -1037,73 +1018,73 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d3, d0 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vadd.f32 q0, q2, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vadd.f32 q0, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.f64 d5, d2 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vadd.f32 q1, q4, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #176] +; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s26, s11 ; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s21, s16 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s9, s17 -; CHECK-NEXT: vadd.f32 q1, q3, q5 -; CHECK-NEXT: 
vadd.f32 q1, q1, q2
-; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
-; CHECK-NEXT: vmov.f64 d7, d4
-; CHECK-NEXT: vmov.f32 s15, s11
-; CHECK-NEXT: vmov.f32 s24, s17
-; CHECK-NEXT: vmov.f32 s12, s18
-; CHECK-NEXT: vmov.f32 s25, s20
-; CHECK-NEXT: vmov.f32 s17, s19
-; CHECK-NEXT: vmov.f32 s26, s23
-; CHECK-NEXT: vmov.f32 s18, s22
-; CHECK-NEXT: vmov.f32 s27, s10
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vmov.f32 s13, s21
-; CHECK-NEXT: vadd.f32 q2, q4, q6
-; CHECK-NEXT: vadd.f32 q2, q2, q3
-; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT: vmov.f32 s10, s16
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vmov.f32 s23, s17
+; CHECK-NEXT: vmov.f32 s11, s19
+; CHECK-NEXT: vadd.f32 q5, q5, q6
+; CHECK-NEXT: vadd.f32 q2, q5, q2
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
-; CHECK-NEXT: vmov.f64 d9, d6
 ; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s24, s21
 ; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f64 d14, d10
 ; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: vmov.f32 s19, s15
-; CHECK-NEXT: vmov.f32 s28, s21
-; CHECK-NEXT: vmov.f32 s16, s22
-; CHECK-NEXT: vmov.f32 s29, s24
-; CHECK-NEXT: vmov.f32 s21, s23
-; CHECK-NEXT: vmov.f32 s30, s27
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s31, s14
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.f32 s17, s25
-; CHECK-NEXT: vadd.f32 q3, q5, q7
-; CHECK-NEXT: vadd.f32 q3, q3, q4
+; CHECK-NEXT: vmov.f32 s25, s12
+; CHECK-NEXT: vmov.f32 s29, s23
+; CHECK-NEXT: vmov.f32 s26, s15
+; CHECK-NEXT: vmov.f32 s30, s14
+; CHECK-NEXT: vmov.f32 s12, s22
+; CHECK-NEXT: vmov.f32 s14, s16
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vmov.f32 s31, s17
+; CHECK-NEXT: vadd.f32 q6, q7, q6
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vadd.f32 q3, q6, q3
 ; CHECK-NEXT: vstrw.32 q3, [r1, #32]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
@@ -1198,49 +1179,46 @@ define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld3_v8f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmovx.f16 s4, s14
-; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmovx.f16 s20, s19
-; CHECK-NEXT: vmovx.f16 s24, s13
-; CHECK-NEXT: vmovx.f16 s8, s5
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vmov.f32 s11, s18
-; CHECK-NEXT: vins.f16 s11, s20
-; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vmovx.f16 s20, s16
-; CHECK-NEXT: vmov.f32 s10, s16
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmov.f64 d10, d6
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s4
-; CHECK-NEXT: vmov.f32 s21, s15
-; CHECK-NEXT: vins.f16 s21, s24
-; CHECK-NEXT: vmovx.f16 s24, s7
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vins.f16 s22, s24
-; CHECK-NEXT: vmovx.f16 s24, s12
-; CHECK-NEXT: vins.f16 s24, s14
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vmovx.f16 s25, s15
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vins.f16 s25, s5
-; CHECK-NEXT: vmovx.f16 s27, s17
-; CHECK-NEXT: vins.f16 s17, s12
-; CHECK-NEXT: vins.f16 s27, s19
-; CHECK-NEXT: vmov.f32 s23, s17
-; CHECK-NEXT: vmovx.f16 s26, s6
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vadd.f16 q1, q5, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: .vsave {d8, d9, d10}
+; CHECK-NEXT: vpush {d8, d9, d10}
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmovx.f16 s20, s1
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmov.f32 s5, s8
+; CHECK-NEXT: vins.f16 s5, s12
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vmov.f32 s6, s11
+; CHECK-NEXT: vmovx.f16 s16, s12
+; CHECK-NEXT: vins.f16 s6, s16
+; CHECK-NEXT: vmovx.f16 s16, s15
+; CHECK-NEXT: vmov.f32 s7, s14
+; CHECK-NEXT: vins.f16 s7, s16
+; CHECK-NEXT: vmovx.f16 s18, s10
+; CHECK-NEXT: vins.f16 s18, s12
+; CHECK-NEXT: vmovx.f16 s19, s13
+; CHECK-NEXT: vins.f16 s19, s15
+; CHECK-NEXT: vmovx.f16 s16, s0
+; CHECK-NEXT: vins.f16 s16, s2
+; CHECK-NEXT: vmovx.f16 s17, s3
+; CHECK-NEXT: vins.f16 s0, s20
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vins.f16 s1, s20
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vins.f16 s17, s9
+; CHECK-NEXT: vins.f16 s10, s20
+; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vmovx.f16 s8, s14
+; CHECK-NEXT: vins.f16 s13, s8
+; CHECK-NEXT: vmov.f32 s3, s13
+; CHECK-NEXT: vadd.f16 q0, q0, q4
+; CHECK-NEXT: vadd.f16 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10}
 ; CHECK-NEXT: bx lr
 entry:
 %l1 = load <24 x half>, <24 x half>* %src, align 4
@@ -1256,89 +1234,83 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld3_v16f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT: vmovx.f16 s4, s14
-; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT: vmovx.f16 s20, s19
-; CHECK-NEXT: vmovx.f16 s24, s13
-; CHECK-NEXT: vmovx.f16 s8, s5
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vins.f16 s1, s8
-; CHECK-NEXT: vmov.f32 s11, s18
-; CHECK-NEXT: vins.f16 s11, s20
-; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vmovx.f16 s20, s16
-; CHECK-NEXT: vmov.f32 s10, s16
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmov.f64 d10, d6
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s4
-; CHECK-NEXT: vmov.f32 s21, s15
-; CHECK-NEXT: vins.f16 s21, s24
-; CHECK-NEXT: vmovx.f16 s24, s7
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vins.f16 s22, s24
-; CHECK-NEXT: vmovx.f16 s24, s12
-; CHECK-NEXT: vins.f16 s24, s14
-; CHECK-NEXT: vmov.f32 s3, s11
-; CHECK-NEXT: vmovx.f16 s25, s15
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vins.f16 s25, s5
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vmovx.f16 s27, s17
-; CHECK-NEXT: vins.f16 s17, s12
-; CHECK-NEXT: vins.f16 s27, s19
-; CHECK-NEXT: vmov.f32 s23, s17
-; CHECK-NEXT: vmovx.f16 s26, s6
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vldrw.u32 q4, [r0]
-; CHECK-NEXT: vadd.f16 q1, q5, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmovx.f16 s12, s18
-; CHECK-NEXT: vmov.f32 s0, s17
-; CHECK-NEXT: vmovx.f16 s20, s7
-; CHECK-NEXT: vins.f16 s0, s12
+; CHECK-NEXT: .vsave {d8, d9, d10}
+; CHECK-NEXT: vpush {d8, d9, d10}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vmovx.f16 s8, s2
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT: vmovx.f16 s20, s1
 ; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmovx.f16 s24, s17
-; CHECK-NEXT: vins.f16 s1, s12
-; CHECK-NEXT: vmov.f32 s15, s6
-; CHECK-NEXT: vins.f16 s15, s20
-; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmovx.f16 s20, s4
-; CHECK-NEXT: vmov.f32 s14, s4
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmov.f64 d10, d8
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmovx.f16 s24, s8
-; CHECK-NEXT: vmov.f32 s21, s19
-; CHECK-NEXT: vins.f16 s21, s24
-; CHECK-NEXT: vmovx.f16 s24, s11
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vins.f16 s22, s24
-; CHECK-NEXT: vmovx.f16 s24, s16
-; CHECK-NEXT: vins.f16 s24, s18
-; CHECK-NEXT: vmov.f32 s3, s15
-; CHECK-NEXT: vmovx.f16 s25, s19
-; CHECK-NEXT: vmovx.f16 s16, s6
-; CHECK-NEXT: vins.f16 s25, s9
-; CHECK-NEXT: vmovx.f16 s27, s5
+; CHECK-NEXT: vmov.f32 s5, s8
+; CHECK-NEXT: vins.f16 s5, s12
+; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT: vmov.f32 s6, s11
+; CHECK-NEXT: vmovx.f16 s16, s12
+; CHECK-NEXT: vins.f16 s6, s16
+; CHECK-NEXT: vmovx.f16 s16, s15
+; CHECK-NEXT: vmov.f32 s7, s14
+; CHECK-NEXT: vins.f16 s7, s16
+; CHECK-NEXT: vmovx.f16 s18, s10
+; CHECK-NEXT: vins.f16 s18, s12
+; CHECK-NEXT: vmovx.f16 s19, s13
+; CHECK-NEXT: vins.f16 s19, s15
+; CHECK-NEXT: vmovx.f16 s16, s0
+; CHECK-NEXT: vins.f16 s16, s2
+; CHECK-NEXT: vmovx.f16 s17, s3
+; CHECK-NEXT: vins.f16 s0, s20
+; CHECK-NEXT: vmovx.f16 s20, s8
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vins.f16 s1, s20
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vins.f16 s17, s9
+; CHECK-NEXT: vins.f16 s10, s20
+; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vmovx.f16 s8, s14
+; CHECK-NEXT: vins.f16 s13, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f32 s3, s13
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vadd.f16 q0, q0, q4
+; CHECK-NEXT: vadd.f16 q1, q0, q1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vmovx.f16 s16, s2
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vins.f16 s4, s16
+; CHECK-NEXT: vmovx.f16 s16, s13
+; CHECK-NEXT: vmov.f32 s5, s12
+; CHECK-NEXT: vmovx.f16 s20, s1
 ; CHECK-NEXT: vins.f16 s5, s16
-; CHECK-NEXT: vins.f16 s27, s7
-; CHECK-NEXT: vmov.f32 s23, s5
-; CHECK-NEXT: vmovx.f16 s26, s10
-; CHECK-NEXT: vins.f16 s26, s4
-; CHECK-NEXT: vadd.f16 q1, q5, q6
-; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: vmovx.f16 s16, s8
+; CHECK-NEXT: vmov.f32 s6, s15
+; CHECK-NEXT: vins.f16 s6, s16
+; CHECK-NEXT: vmovx.f16 s16, s11
+; CHECK-NEXT: vmov.f32 s7, s10
+; CHECK-NEXT: vins.f16 s7, s16
+; CHECK-NEXT: vmovx.f16 s18, s14
+; CHECK-NEXT: vins.f16 s18, s8
+; CHECK-NEXT: vmovx.f16 s19, s9
+; CHECK-NEXT: vins.f16 s19, s11
+; CHECK-NEXT: vmovx.f16 s16, s0
+; CHECK-NEXT: vins.f16 s16, s2
+; CHECK-NEXT: vmovx.f16 s17, s3
+; CHECK-NEXT: vins.f16 s0, s20
+; CHECK-NEXT: vmovx.f16 s20, s12
+; CHECK-NEXT: vmov.f32 s1, s3
+; CHECK-NEXT: vins.f16 s1, s20
+; CHECK-NEXT: vmovx.f16 s20, s15
+; CHECK-NEXT: vins.f16 s17, s13
+; CHECK-NEXT: vins.f16 s14, s20
+; CHECK-NEXT: vmov.f32 s2, s14
+; CHECK-NEXT: vmovx.f16 s12, s10
+; CHECK-NEXT: vins.f16 s9, s12
+; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vadd.f16 q0, q0, q4
+; CHECK-NEXT: vadd.f16 q0, q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10}
 ; CHECK-NEXT: bx lr
 entry:
 %l1 = load <48 x half>, <48 x half>* %src, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -267,20 +267,21 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldrh r2, [r0, #6]
-; CHECK-NEXT: ldrh.w lr, [r0, #4]
-; CHECK-NEXT: ldrh.w r12, [r0, #8]
-; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: ldrh r3, [r0, #2]
-; CHECK-NEXT: vmov q1[2], q1[0], lr, r2
-; CHECK-NEXT: ldrh r4, [r0]
-; CHECK-NEXT: ldrh r0, [r0, #10]
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT: ldrh r2, [r0, #10]
+; CHECK-NEXT: ldrh r4, [r0, #8]
+; CHECK-NEXT: ldrh.w r12, [r0, #2]
+; CHECK-NEXT: ldrh.w lr, [r0]
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
+; CHECK-NEXT: ldrh r3, [r0, #6]
+; CHECK-NEXT: ldrh r0, [r0, #4]
+; CHECK-NEXT: vmov q1[2], q1[0], r0, r3
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmovnt.i32 q2, q0
+; CHECK-NEXT: vmov q0[2], q0[0], lr, r12
+; CHECK-NEXT: vmov r0, s10
 ; CHECK-NEXT: vmov.f32 s1, s4
 ; CHECK-NEXT: vmov.f32 s3, s2
-; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[2], r4
 ; CHECK-NEXT: vstrh.32 q0, [r1]
 ; CHECK-NEXT: str r0, [r1, #8]
 ; CHECK-NEXT: pop {r4, pc}
@@ -347,77 +348,70 @@ define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
 ; CHECK-LABEL: vst3_v8i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vmov.f64 d0, d4
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vmov.f32 s18, s12
-; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vmov.u16 r0, q4[2]
-; CHECK-NEXT: vins.f16 s3, s5
-; CHECK-NEXT: vmov.16 q5[2], r0
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.16 q5[3], r2
+; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmov.f64 d2, d4
+; CHECK-NEXT: vmov.u16 r2, q4[2]
+; CHECK-NEXT: vmov.u16 r0, q3[1]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: vmov.u16 r2, q2[3]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: vmov.u16 r2, q4[4]
+; CHECK-NEXT: vins.f16 s4, s12
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov.16 q1[4], r0
+; CHECK-NEXT: vmov.u16 r2, q2[5]
+; CHECK-NEXT: vmov.f32 s7, s9
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vins.f16 s7, s13
+; CHECK-NEXT: vmov.u16 r2, q4[0]
+; CHECK-NEXT: vmov.f32 s5, s8
+; CHECK-NEXT: vmov.16 q5[2], r2
+; CHECK-NEXT: vmov.u16 r0, q1[3]
+; CHECK-NEXT: vmov.u16 r2, q2[6]
+; CHECK-NEXT: vmov.16 q5[3], r0
+; CHECK-NEXT: vmov.u16 r0, q1[4]
 ; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u16 r0, q4[5]
+; CHECK-NEXT: vmov.u16 r0, q4[1]
 ; CHECK-NEXT: vmov.16 q5[5], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q4[0], r0
 ; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q4[1], r0
-; CHECK-NEXT: vmov.f32 s1, s21
-; CHECK-NEXT: vins.f16 s17, s7
-; CHECK-NEXT: vmov.u16 r0, q1[7]
-; CHECK-NEXT: vmov.f32 s2, s22
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov.f32 s21, s11
-; CHECK-NEXT: vmov.u16 r0, q3[7]
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vmov.f32 s22, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.u16 r2, q5[2]
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.16 q6[2], r2
-; CHECK-NEXT: vmov.16 q6[3], r0
-; CHECK-NEXT: vmov.u16 r0, q4[4]
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vmov.u16 r0, q5[5]
-; CHECK-NEXT: vmov.16 q6[5], r0
-; CHECK-NEXT: vmov.u16 r0, q3[2]
+; CHECK-NEXT: vmov.f32 s5, s21
+; CHECK-NEXT: vmov.f32 s6, s22
 ; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.u16 r0, q4[5]
+; CHECK-NEXT: vmov.f32 s1, s17
 ; CHECK-NEXT: vmov.16 q5[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[4]
+; CHECK-NEXT: vmov.u16 r0, q3[7]
+; CHECK-NEXT: vins.f16 s21, s15
+; CHECK-NEXT: vmov.f32 s2, s10
 ; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
+; CHECK-NEXT: vmov.u16 r0, q4[7]
 ; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vrev32.16 q1, q1
-; CHECK-NEXT: vmov.f32 s21, s13
-; CHECK-NEXT: vmov.u16 r2, q1[2]
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vmov.16 q2[2], r2
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov.f32 s22, s19
+; CHECK-NEXT: vmov.16 q4[2], r2
 ; CHECK-NEXT: vmov.u16 r0, q5[3]
-; CHECK-NEXT: vmov.f32 s17, s25
-; CHECK-NEXT: vmov.16 q2[3], r0
+; CHECK-NEXT: vmov.u16 r2, q3[3]
+; CHECK-NEXT: vmov.16 q4[3], r0
 ; CHECK-NEXT: vmov.u16 r0, q5[4]
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov.f32 s18, s26
-; CHECK-NEXT: vmov.f32 s21, s9
-; CHECK-NEXT: vstrw.32 q4, [r1, #32]
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q5, [r1, #16]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov.u16 r0, q2[7]
+; CHECK-NEXT: vmov.16 q4[5], r0
+; CHECK-NEXT: vmov.u16 r0, q0[3]
+; CHECK-NEXT: vmov.16 q2[2], r2
+; CHECK-NEXT: vins.f16 s2, s14
+; CHECK-NEXT: vmov.16 q2[3], r0
+; CHECK-NEXT: vmov.f32 s21, s17
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s1, s9
+; CHECK-NEXT: vmov.f32 s22, s18
+; CHECK-NEXT: vstrw.32 q5, [r1, #32]
+; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -438,164 +432,151 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #112
-; CHECK-NEXT: sub sp, #112
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT: vmov.f64 d12, d2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s24, s0
-; CHECK-NEXT: vmov.16 q6[4], r2
-; CHECK-NEXT: vmov.f32 s27, s5
-; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT: vins.f16 s27, s1
-; CHECK-NEXT: vmov.f32 s13, s4
-; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s14, s4
-; CHECK-NEXT: vmov.f32 s25, s8
-; CHECK-NEXT: vmov.u16 r3, q3[2]
-; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vmov.u16 r2, q6[3]
-; CHECK-NEXT: vmov.16 q3[2], r3
-; CHECK-NEXT: vmov.16 q3[3], r2
-; CHECK-NEXT: vmov.u16 r2, q6[4]
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov.16 q5[0], r2
+; CHECK-NEXT: .pad #96
+; CHECK-NEXT: sub sp, #96
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q7, [r0]
 ; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov.16 q5[1], r2
+; CHECK-NEXT: vmov q5, q0
+; CHECK-NEXT: vmov.16 q4[0], r2
+; CHECK-NEXT: vmov.u16 r2, q0[5]
+; CHECK-NEXT: vmov.16 q4[1], r2
+; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vins.f16 s17, s7
+; CHECK-NEXT: vmov.u16 r3, q2[6]
+; CHECK-NEXT: vmov.16 q4[6], r2
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: vins.f16 s21, s3
+; CHECK-NEXT: vmov.16 q4[7], r2
+; CHECK-NEXT: vmov.f64 d12, d14
+; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: vmov.u16 r2, q4[3]
+; CHECK-NEXT: vmov.u16 r3, q5[0]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: vmov.u16 r2, q4[4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov.u16 r2, q2[7]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: vmov.u16 r2, q1[1]
+; CHECK-NEXT: vmov.f32 s17, s1
+; CHECK-NEXT: vmov.f32 s18, s2
 ; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: vmov.f32 s9, s3
-; CHECK-NEXT: vmov.16 q5[6], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.f32 s10, s3
-; CHECK-NEXT: vmov.16 q5[7], r2
-; CHECK-NEXT: vmov.u16 r3, q2[2]
-; CHECK-NEXT: vmov.f32 s22, s7
-; CHECK-NEXT: vmov.16 q1[2], r3
-; CHECK-NEXT: vmov.u16 r2, q5[3]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vmov.u16 r2, q5[4]
-; CHECK-NEXT: vmov.16 q1[4], r2
-; CHECK-NEXT: vmov.u16 r2, q2[5]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmov.16 q1[5], r2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.f32 s21, s5
-; CHECK-NEXT: vmov.f64 d8, d4
-; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s16, s0
-; CHECK-NEXT: vmov.f32 s22, s6
-; CHECK-NEXT: vmov.16 q4[4], r2
-; CHECK-NEXT: vstrw.32 q5, [r1, #80]
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vins.f16 s19, s1
-; CHECK-NEXT: vmov.f32 s17, s8
-; CHECK-NEXT: vmov.f32 s9, s28
-; CHECK-NEXT: vmov.u16 r0, q4[3]
-; CHECK-NEXT: vmov.f32 s10, s28
-; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov.16 q1[3], r0
-; CHECK-NEXT: vmov.u16 r0, q4[4]
-; CHECK-NEXT: vmov.16 q1[4], r0
-; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q1[5], r0
-; CHECK-NEXT: vmov.f32 s17, s5
+; CHECK-NEXT: vins.f16 s8, s4
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q2[4], r2
+; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vins.f16 s11, s5
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vmov.f32 s9, s0
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: vmov.u16 r2, q2[3]
+; CHECK-NEXT: vins.f16 s24, s4
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: vmov.u16 r2, q2[4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: vmov.u16 r2, q1[1]
+; CHECK-NEXT: vmov.16 q6[4], r2
+; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s27, s29
+; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT: vins.f16 s27, s5
+; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vmov.f32 s25, s28
+; CHECK-NEXT: vmov.u16 r0, q2[0]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: vmov.u16 r2, q6[3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: vmov.u16 r0, q6[4]
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.u16 r0, q2[1]
+; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vmov.16 q0[5], r0
+; CHECK-NEXT: vmov.u16 r0, q5[1]
+; CHECK-NEXT: vmov.f32 s25, s1
+; CHECK-NEXT: vmov.16 q4[5], r0
+; CHECK-NEXT: vmov.u16 r0, q3[5]
+; CHECK-NEXT: vmov.f32 s26, s2
+; CHECK-NEXT: vmov q0, q3
+; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q3[0], r0
 ; CHECK-NEXT: vmov.u16 r0, q2[5]
-; CHECK-NEXT: vmov.f32 s18, s6
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vmov.u16 r0, q0[5]
-; CHECK-NEXT: vmov.16 q2[0], r0
+; CHECK-NEXT: vmov.u16 r2, q7[6]
+; CHECK-NEXT: vmov.16 q3[1], r0
+; CHECK-NEXT: vmov.u16 r0, q0[7]
+; CHECK-NEXT: vins.f16 s13, s3
+; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q3[6], r0
+; CHECK-NEXT: vmov.u16 r0, q2[7]
+; CHECK-NEXT: vmov.16 q3[7], r0
+; CHECK-NEXT: vmov.16 q4[2], r2
+; CHECK-NEXT: vmov.f32 s14, s11
+; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vmov.u16 r0, q3[3]
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.16 q4[3], r0
+; CHECK-NEXT: vmov.u16 r0, q3[4]
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov.u16 r0, q7[7]
+; CHECK-NEXT: vmov.16 q4[5], r0
+; CHECK-NEXT: vmov.u16 r0, q2[2]
+; CHECK-NEXT: vmov.16 q5[0], r0
+; CHECK-NEXT: vmov.u16 r0, q7[3]
+; CHECK-NEXT: vmov.16 q5[1], r0
+; CHECK-NEXT: vmov.u16 r0, q2[4]
+; CHECK-NEXT: vmov.16 q5[6], r0
 ; CHECK-NEXT: vmov.u16 r0, q7[5]
+; CHECK-NEXT: vmov.16 q5[7], r0
+; CHECK-NEXT: vmov.u16 r0, q1[2]
+; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s21, s9
+; CHECK-NEXT: vmov.16 q2[0], r0
+; CHECK-NEXT: vmov.f32 s22, s30
+; CHECK-NEXT: vmov.u16 r0, q1[3]
+; CHECK-NEXT: vmov q7, q0
 ; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: vins.f16 s9, s3
-; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vmov.u16 r0, q0[4]
 ; CHECK-NEXT: vmov.16 q2[6], r0
-; CHECK-NEXT: vmov.u16 r0, q7[7]
-; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.u16 r0, q1[5]
 ; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vmov.f32 s10, s31
-; CHECK-NEXT: vmov.u16 r2, q1[2]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s9, s29
+; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s10, s6
+; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.u16 r2, q0[3]
 ; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.f32 s29, s5
 ; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: vmov.f32 s30, s6
+; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.f32 s25, s13
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.u16 r0, q1[5]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s26, s14
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vrev32.16 q1, q1
-; CHECK-NEXT: vmov.u16 r0, q7[2]
-; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.16 q1[0], r0
-; CHECK-NEXT: vmov.u16 r0, q3[3]
+; CHECK-NEXT: vmov.u16 r0, q5[3]
+; CHECK-NEXT: vins.f16 s10, s6
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s10
+; CHECK-NEXT: vstrw.32 q7, [r1, #48]
 ; CHECK-NEXT: vmov.f32 s9, s1
-; CHECK-NEXT: vmov.16 q1[1], r0
-; CHECK-NEXT: vmov.u16 r0, q7[4]
-; CHECK-NEXT: vmov.16 q1[6], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q1[7], r0
+; CHECK-NEXT: vmov.u16 r2, q1[3]
+; CHECK-NEXT: vins.f16 s22, s6
+; CHECK-NEXT: vmov.f32 s13, s17
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vstrw.32 q6, [r1]
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
 ; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vmov.f32 s5, s29
-; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s6, s14
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
-; CHECK-NEXT: vmov.u16 r2, q7[2]
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vstrw.32 q6, [r1, #48]
-; CHECK-NEXT: vmov.16 q3[3], r0
-; CHECK-NEXT: vmov.u16 r0, q1[4]
-; CHECK-NEXT: vmov.16 q3[4], r0
-; CHECK-NEXT: vmov.u16 r0, q7[5]
-; CHECK-NEXT: vmov.16 q3[5], r0
-; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vmov.f32 s6, s14
-; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q1, [r1, #64]
-; CHECK-NEXT: vmov.u16 r0, q3[2]
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vmov.u16 r0, q7[3]
-; CHECK-NEXT: vmov.16 q0[1], r0
-; CHECK-NEXT: vmov.u16 r0, q3[4]
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vmov.u16 r0, q7[5]
-; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vmov.f32 s1, s13
-; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s2, s30
-; CHECK-NEXT: vrev32.16 q3, q3
-; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r2, q3[2]
-; CHECK-NEXT: vmov.16 q7[2], r2
-; CHECK-NEXT: vmov.16 q7[3], r0
-; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.16 q7[4], r0
-; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.16 q7[5], r0
-; CHECK-NEXT: vmov.f32 s1, s29
-; CHECK-NEXT: vmov.f32 s2, s30
-; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: add sp, #112
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: vmov.16 q0[3], r0
+; CHECK-NEXT: vstrw.32 q2, [r1, #64]
+; CHECK-NEXT: vmov.f32 s2, s22
+; CHECK-NEXT: vmov.f32 s21, s1
+; CHECK-NEXT: vmov.f32 s22, s2
+; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q5, [r1, #16]
+; CHECK-NEXT: vstrw.32 q0, [r1, #80]
+; CHECK-NEXT: add sp, #96
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -717,8 +698,8 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
 ; CHECK-LABEL: vst3_v8i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
 ; CHECK-NEXT: vldrb.u16 q2, [r0, #16]
 ; CHECK-NEXT: vldrb.u16 q3, [r0]
@@ -728,23 +709,24 @@
 ; CHECK-NEXT: vmov.16 q0[1], r2
 ; CHECK-NEXT: vmov.u16 r2, q1[7]
 ; CHECK-NEXT: vins.f16 s1, s7
-; CHECK-NEXT: vmov.f32 s17, s15
+; CHECK-NEXT: vmov.u16 r0, q3[6]
 ; CHECK-NEXT: vmov.16 q0[6], r2
 ; CHECK-NEXT: vmov.u16 r2, q2[7]
 ; CHECK-NEXT: vmov.16 q0[7], r2
-; CHECK-NEXT: vmov.f32 s18, s15
+; CHECK-NEXT: vmov.16 q4[2], r0
 ; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmov.u16 r0, q4[2]
-; CHECK-NEXT: vmov.16 q5[2], r0
 ; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.16 q5[3], r2
 ; CHECK-NEXT: vmov.u16 r0, q0[4]
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u16 r0, q4[5]
-; CHECK-NEXT: vmov.16 q5[5], r0
+; CHECK-NEXT: vmov.16 q4[3], r2
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov.u16 r0, q3[7]
+; CHECK-NEXT: vmov.16 q4[5], r0
 ; CHECK-NEXT: vmov.u16 r0, q3[0]
+; CHECK-NEXT: vmov.f32 s1, s17
+; CHECK-NEXT: vmov.f32 s2, s18
 ; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u16 r0, q1[0]
+; CHECK-NEXT: vstrb.16 q0, [r1, #16]
 ; CHECK-NEXT: vmov.8 q4[1], r0
 ; CHECK-NEXT: vmov.u16 r0, q2[0]
 ; CHECK-NEXT: vmov.8 q4[2], r0
@@ -771,14 +753,11 @@
 ; CHECK-NEXT: vmov.u16 r0, q1[4]
 ; CHECK-NEXT: vmov.8 q4[13], r0
 ; CHECK-NEXT: vmov.u16 r0, q2[4]
-; CHECK-NEXT: vmov.f32 s1, s21
 ; CHECK-NEXT: vmov.8 q4[14], r0
 ; CHECK-NEXT: vmov.u16 r0, q3[5]
-; CHECK-NEXT: vmov.f32 s2, s22
 ; CHECK-NEXT: vmov.8 q4[15], r0
-; CHECK-NEXT: vstrb.16 q0, [r1, #16]
 ; CHECK-NEXT: vstrw.32 q4, [r1]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
@@ -797,204 +776,177 @@ define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) {
 ; CHECK-LABEL: vst3_v16i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vmov.u8 r3, q3[0]
-; CHECK-NEXT: vmov.u8 r0, q2[0]
-; CHECK-NEXT: vmov.8 q5[0], r3
-; CHECK-NEXT: vmov.u8 r2, q1[0]
+; CHECK-NEXT: vmov.u8 r2, q0[5]
+; CHECK-NEXT: vmov.8 q4[0], r2
+; CHECK-NEXT: vmov.u8 r2, q2[5]
+; CHECK-NEXT: vmov.8 q4[1], r2
+; CHECK-NEXT: vmov.u8 r2, q0[6]
+; CHECK-NEXT: vmov.8 q4[3], r2
+; CHECK-NEXT: vmov.u8 r2, q2[6]
+; CHECK-NEXT: vmov.8 q4[4], r2
+; CHECK-NEXT: vmov.u8 r2, q0[7]
+; CHECK-NEXT: vmov.8 q4[6], r2
+; CHECK-NEXT: vmov.u8 r2, q2[7]
+; CHECK-NEXT: vmov.8 q4[7], r2
+; CHECK-NEXT: vmov.u8 r2, q0[8]
+; CHECK-NEXT: vmov.8 q4[9], r2
+; CHECK-NEXT: vmov.u8 r2, q2[8]
+; CHECK-NEXT: vmov.8 q4[10], r2
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: vmov.8 q4[12], r2
+; CHECK-NEXT: vmov.u8 r2, q2[9]
+; CHECK-NEXT: vmov.8 q4[13], r2
+; CHECK-NEXT: vmov.u8 r2, q0[10]
+; CHECK-NEXT: vmov.8 q4[15], r2
+; CHECK-NEXT: vmov.u8 r0, q3[6]
+; CHECK-NEXT: vmov.u8 r2, q4[0]
+; CHECK-NEXT: vmov.8 q1[0], r2
+; CHECK-NEXT: vmov.u8 r2, q4[1]
+; CHECK-NEXT: vmov.8 q1[1], r2
+; CHECK-NEXT: vmov.8 q1[2], r0
+; CHECK-NEXT: vmov.u8 r0, q4[3]
+; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov.u8 r0, q4[4]
+; CHECK-NEXT: vmov.8 q1[4], r0
+; CHECK-NEXT: vmov.u8 r0, q3[7]
+; CHECK-NEXT: vmov.8 q1[5], r0
+; CHECK-NEXT: vmov.u8 r0, q4[6]
+; CHECK-NEXT: vmov.8 q1[6], r0
+; CHECK-NEXT: vmov.u8 r0, q4[7]
+; CHECK-NEXT: vmov.8 q1[7], r0
+; CHECK-NEXT: vmov.u8 r0, q3[8]
+; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.u8 r0, q4[9]
+; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.u8 r0, q4[10]
+; CHECK-NEXT: vmov.8 q1[10], r0
+; CHECK-NEXT: vmov.u8 r0, q3[9]
+; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.u8 r0, q4[12]
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q4[13]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q3[10]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q4[15]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vmov.u8 r0, q3[0]
+; CHECK-NEXT: vmov.8 q5[0], r0
+; CHECK-NEXT: vmov.u8 r0, q0[0]
 ; CHECK-NEXT: vmov.8 q5[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[1]
 ; CHECK-NEXT: vmov.8 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q2[1]
+; CHECK-NEXT: vmov.u8 r0, q0[1]
 ; CHECK-NEXT: vmov.8 q5[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[2]
 ; CHECK-NEXT: vmov.8 q5[6], r0
-; CHECK-NEXT: vmov.u8 r0, q2[2]
+; CHECK-NEXT: vmov.u8 r0, q0[2]
 ; CHECK-NEXT: vmov.8 q5[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[3]
 ; CHECK-NEXT: vmov.8 q5[9], r0
-; CHECK-NEXT: vmov.u8 r0, q2[3]
+; CHECK-NEXT: vmov.u8 r0, q0[3]
 ; CHECK-NEXT: vmov.8 q5[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[4]
-; CHECK-NEXT: vmov.8 q4[2], r2
-; CHECK-NEXT: vmov.u8 r2, q1[2]
 ; CHECK-NEXT: vmov.8 q5[12], r0
-; CHECK-NEXT: vmov.u8 r0, q2[4]
-; CHECK-NEXT: vmov.8 q4[8], r2
-; CHECK-NEXT: vmov.u8 r2, q1[3]
+; CHECK-NEXT: vmov.u8 r0, q0[4]
 ; CHECK-NEXT: vmov.8 q5[13], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[5]
 ; CHECK-NEXT: vmov.8 q5[15], r0
-; CHECK-NEXT: vmov.8 q4[11], r2
-; CHECK-NEXT: vmov.u8 r2, q1[4]
-; CHECK-NEXT: vmov.u8 r0, q5[0]
-; CHECK-NEXT: vmov.8 q4[14], r2
-; CHECK-NEXT: vmov.8 q0[0], r0
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov.u8 r0, q5[1]
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov.u8 r2, q4[2]
-; CHECK-NEXT: vmov.8 q0[2], r2
-; CHECK-NEXT: vmov.u8 r0, q5[3]
-; CHECK-NEXT: vmov.8 q0[3], r0
-; CHECK-NEXT: vmov.u8 r0, q5[4]
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmov.u8 r0, q4[5]
-; CHECK-NEXT: vmov.8 q0[5], r0
-; CHECK-NEXT: vmov.u8 r0, q5[6]
-; CHECK-NEXT: vmov.8 q0[6], r0
-; CHECK-NEXT: vmov.u8 r0, q5[7]
-; CHECK-NEXT: vmov.8 q0[7], r0
-; CHECK-NEXT: vmov.u8 r0, q4[8]
-; CHECK-NEXT: vmov.8 q0[8], r0
-; CHECK-NEXT: vmov.u8 r0, q5[9]
-; CHECK-NEXT: vmov.8 q0[9], r0
-; CHECK-NEXT: vmov.u8 r0, q5[10]
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.u8 r0, q4[11]
-; CHECK-NEXT: vmov.8 q0[11], r0
-; CHECK-NEXT: vmov.u8 r0, q5[12]
-; CHECK-NEXT: vmov.8 q0[12], r0
-; CHECK-NEXT: vmov.u8 r0, q5[13]
-; CHECK-NEXT: vmov.8 q0[13], r0
-; CHECK-NEXT: vmov.u8 r0, q4[14]
-; CHECK-NEXT: vmov.8 q0[14], r0
-; CHECK-NEXT: vmov.u8 r0, q5[15]
-; CHECK-NEXT: vmov.8 q0[15], r0
-; CHECK-NEXT: vmov.u8 r0, q2[5]
-; CHECK-NEXT: vmov.8 q5[0], r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.8 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q2[6]
-; CHECK-NEXT: vmov.8 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.8 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q2[7]
-; CHECK-NEXT: vmov.8 q5[6], r0
-; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.8 q5[7], r0
-; CHECK-NEXT: vmov.u8 r0, q2[8]
-; CHECK-NEXT: vmov.8 q5[9], r0
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.8 q5[10], r0
-; CHECK-NEXT: vmov.u8 r0, q2[9]
-; CHECK-NEXT: vmov.8 q5[12], r0
-; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.8 q5[13], r0
-; CHECK-NEXT: vmov.u8 r0, q2[10]
-; CHECK-NEXT: vmov.8 q5[15], r0
-; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: vmov.u8 r0, q5[0]
 ; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[1]
 ; CHECK-NEXT: vmov.8 q4[1], r0
-; CHECK-NEXT: vmov.u8 r0, q3[7]
-; CHECK-NEXT: vmov.8 q6[5], r0
-; CHECK-NEXT: vmov.u8 r0, q3[8]
-; CHECK-NEXT: vmov.8 q6[8], r0
-; CHECK-NEXT: vmov.u8 r0, q3[9]
-; CHECK-NEXT: vmov.8 q6[11], r0
-; CHECK-NEXT: vmov.f32 s24, s13
-; CHECK-NEXT: vmov.f32 s27, s14
-; CHECK-NEXT: vmov.u8 r0, q6[2]
+; CHECK-NEXT: vmov.u8 r0, q2[0]
 ; CHECK-NEXT: vmov.8 q4[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[3]
 ; CHECK-NEXT: vmov.8 q4[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[4]
 ; CHECK-NEXT: vmov.8 q4[4], r0
-; CHECK-NEXT: vmov.u8 r0, q6[5]
+; CHECK-NEXT: vmov.u8 r0, q2[1]
 ; CHECK-NEXT: vmov.8 q4[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[6]
 ; CHECK-NEXT: vmov.8 q4[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[7]
 ; CHECK-NEXT: vmov.8 q4[7], r0
-; CHECK-NEXT: vmov.u8 r0, q6[8]
+; CHECK-NEXT: vmov.u8 r0, q2[2]
 ; CHECK-NEXT: vmov.8 q4[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[9]
 ; CHECK-NEXT: vmov.8 q4[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[10]
 ; CHECK-NEXT: vmov.8 q4[10], r0
-; CHECK-NEXT: vmov.u8 r0, q6[11]
+; CHECK-NEXT: vmov.u8 r0, q2[3]
 ; CHECK-NEXT: vmov.8 q4[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[12]
 ; CHECK-NEXT: vmov.8 q4[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[13]
 ; CHECK-NEXT: vmov.8 q4[13], r0
-; CHECK-NEXT: vmov.u8 r0, q6[14]
+; CHECK-NEXT: vmov.u8 r0, q2[4]
 ; CHECK-NEXT: vmov.8 q4[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[15]
 ; CHECK-NEXT: vmov.8 q4[15], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
+; CHECK-NEXT: vmov.u8 r0, q2[10]
 ; CHECK-NEXT: vmov.8 q5[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[11]
 ; CHECK-NEXT: vmov.8 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.u8 r0, q2[11]
 ; CHECK-NEXT: vmov.8 q5[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[12]
 ; CHECK-NEXT: vmov.8 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
+; CHECK-NEXT: vmov.u8 r0, q2[12]
 ; CHECK-NEXT: vmov.8 q5[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[13]
 ; CHECK-NEXT: vmov.8 q5[7], r0
-; CHECK-NEXT: vmov.u8 r0, q1[13]
+; CHECK-NEXT: vmov.u8 r0, q2[13]
 ; CHECK-NEXT: vmov.8 q5[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[14]
 ; CHECK-NEXT: vmov.8 q5[10], r0
-; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.u8 r0, q2[14]
 ; CHECK-NEXT: vmov.8 q5[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[15]
 ; CHECK-NEXT: vmov.8 q5[13], r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
+; CHECK-NEXT: vmov.u8 r0, q2[15]
 ; CHECK-NEXT: vmov.8 q5[15], r0
-; CHECK-NEXT: vstrw.32 q4, [r1, #16]
+; CHECK-NEXT: vstrw.32 q4, [r1]
 ; CHECK-NEXT: vmov.u8 r0, q5[0]
-; CHECK-NEXT: vmov.8 q1[0], r0
+; CHECK-NEXT: vmov.8 q2[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[1]
-; CHECK-NEXT: vmov.8 q1[1], r0
-; CHECK-NEXT: vmov.u8 r0, q2[11]
-; CHECK-NEXT: vmov.8 q3[2], r0
-; CHECK-NEXT: vmov.u8 r0, q2[12]
-; CHECK-NEXT: vmov.8 q3[5], r0
-; CHECK-NEXT: vmov.u8 r0, q2[13]
-; CHECK-NEXT: vmov.8 q3[8], r0
-; CHECK-NEXT: vmov.u8 r0, q2[14]
-; CHECK-NEXT: vmov.8 q3[11], r0
-; CHECK-NEXT: vmov.u8 r0, q2[15]
-; CHECK-NEXT: vmov.8 q3[14], r0
-; CHECK-NEXT: vmov.u8 r0, q3[2]
-; CHECK-NEXT: vmov.8 q1[2], r0
+; CHECK-NEXT: vmov.8 q2[1], r0
+; CHECK-NEXT: vmov.u8 r0, q0[11]
+; CHECK-NEXT: vmov.8 q2[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[3]
-; CHECK-NEXT: vmov.8 q1[3], r0
+; CHECK-NEXT: vmov.8 q2[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[4]
-; CHECK-NEXT: vmov.8 q1[4], r0
-; CHECK-NEXT: vmov.u8 r0, q3[5]
-; CHECK-NEXT: vmov.8 q1[5], r0
+; CHECK-NEXT: vmov.8 q2[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[12]
+; CHECK-NEXT: vmov.8 q2[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[6]
-; CHECK-NEXT: vmov.8 q1[6], r0
+; CHECK-NEXT: vmov.8 q2[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[7]
-; CHECK-NEXT: vmov.8 q1[7], r0
-; CHECK-NEXT: vmov.u8 r0, q3[8]
-; CHECK-NEXT: vmov.8 q1[8], r0
+; CHECK-NEXT: vmov.8 q2[7], r0
+; CHECK-NEXT: vmov.u8 r0, q0[13]
+; CHECK-NEXT: vmov.8 q2[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[9]
-; CHECK-NEXT: vmov.8 q1[9], r0
+; CHECK-NEXT: vmov.8 q2[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[10]
-; CHECK-NEXT: vmov.8 q1[10], r0
-; CHECK-NEXT: vmov.u8 r0, q3[11]
-; CHECK-NEXT: vmov.8 q1[11], r0
+; CHECK-NEXT: vmov.8 q2[10], r0
+; CHECK-NEXT: vmov.u8 r0, q0[14]
+; CHECK-NEXT: vmov.8 q2[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[12]
-; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.8 q2[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[13]
-; CHECK-NEXT: vmov.8 q1[13], r0
-; CHECK-NEXT: vmov.u8 r0, q3[14]
-; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.8 q2[13], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q2[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q5[15]
-; CHECK-NEXT: vmov.8 q1[15], r0
-; CHECK-NEXT: vstrw.32 q1, [r1, #32]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmov.8 q2[15], r0
+; CHECK-NEXT: vstrw.32 q2, [r1, #32]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
@@ -1437,65 +1389,53 @@ define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
 ; CHECK-LABEL: vst3_v8f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vmovx.f16 s0, s8
-; CHECK-NEXT: vmovx.f16 s20, s12
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmovx.f16 s0, s4
+; CHECK-NEXT: vmovx.f16 s16, s12
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s24, s6
+; CHECK-NEXT: vmovx.f16 s18, s8
 ; CHECK-NEXT: vmov.f64 d0, d6
-; CHECK-NEXT: vins.f16 s0, s8
-; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vins.f16 s8, s16
+; CHECK-NEXT: vmovx.f16 s20, s10
+; CHECK-NEXT: vmovx.f16 s22, s15
+; CHECK-NEXT: vins.f16 s0, s4
+; CHECK-NEXT: vmov r0, s7
 ; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vmov.f32 s18, s4
 ; CHECK-NEXT: vmov.f32 s3, s13
-; CHECK-NEXT: vins.f16 s17, s20
-; CHECK-NEXT: vins.f16 s3, s9
-; CHECK-NEXT: vmovx.f16 s20, s18
+; CHECK-NEXT: vins.f16 s3, s5
 ; CHECK-NEXT: vmov.f32 s1, s12
-; CHECK-NEXT: vins.f16 s2, s20
-; CHECK-NEXT: vmovx.f16 s20, s10
-; CHECK-NEXT: vins.f16 s20, s24
-; CHECK-NEXT: vmov r0, s11
-; CHECK-NEXT: vmov.16 q5[3], r0
-; CHECK-NEXT: vmovx.f16 s24, s7
-; CHECK-NEXT: vmovx.f16 s23, s11
-; CHECK-NEXT: vrev32.16 q2, q2
-; CHECK-NEXT: vins.f16 s23, s24
-; CHECK-NEXT: vmov.f32 s25, s15
-; CHECK-NEXT: vmov.f32 s22, s7
-; CHECK-NEXT: vmovx.f16 s28, s21
-; CHECK-NEXT: vmov.f32 s26, s15
-; CHECK-NEXT: vins.f16 s25, s28
-; CHECK-NEXT: vmovx.f16 s28, s26
-; CHECK-NEXT: vins.f16 s22, s28
-; CHECK-NEXT: vmovx.f16 s28, s13
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vins.f16 s4, s28
-; CHECK-NEXT: vmovx.f16 s28, s14
-; CHECK-NEXT: vins.f16 s6, s28
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmov.f32 s7, s6
-; CHECK-NEXT: vmov.f32 s6, s14
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vins.f16 s9, s12
-; CHECK-NEXT: vmovx.f16 s12, s10
-; CHECK-NEXT: vins.f16 s6, s12
-; CHECK-NEXT: vmov.f32 s26, s22
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.f32 s1, s17
-; CHECK-NEXT: vmov.f32 s21, s25
-; CHECK-NEXT: vmov.f32 s5, s9
-; CHECK-NEXT: vmov.f32 s2, s18
+; CHECK-NEXT: vmov.f32 s1, s8
+; CHECK-NEXT: vins.f16 s2, s18
+; CHECK-NEXT: vmovx.f16 s16, s6
+; CHECK-NEXT: vins.f16 s16, s20
+; CHECK-NEXT: vmovx.f16 s20, s11
+; CHECK-NEXT: vmov.16 q4[3], r0
 ; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s6, s10
-; CHECK-NEXT: vstrw.32 q5, [r1, #32]
-; CHECK-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT: vmovx.f16 s19, s7
+; CHECK-NEXT: vins.f16 s19, s20
+; CHECK-NEXT: vmov.f32 s18, s11
+; CHECK-NEXT: vmovx.f16 s20, s17
+; CHECK-NEXT: vins.f16 s15, s20
+; CHECK-NEXT: vmovx.f16 s20, s13
+; CHECK-NEXT: vmov.f32 s8, s9
+; CHECK-NEXT: vins.f16 s8, s20
+; CHECK-NEXT: vmovx.f16 s20, s14
+; CHECK-NEXT: vins.f16 s10, s20
+; CHECK-NEXT: vmov.f32 s17, s15
+; CHECK-NEXT: vmov.f32 s11, s10
+; CHECK-NEXT: vins.f16 s18, s22
+; CHECK-NEXT: vmov.f32 s10, s14
+; CHECK-NEXT: vmovx.f16 s12, s9
+; CHECK-NEXT: vmovx.f16 s9, s5
+; CHECK-NEXT: vstrw.32 q4, [r1, #32]
+; CHECK-NEXT: vins.f16 s9, s12
+; CHECK-NEXT: vins.f16 s10, s6
+; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
@@ -1516,146 +1456,109 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #144
-; CHECK-NEXT: sub sp, #144
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmovx.f16 s0, s22
-; CHECK-NEXT: vmovx.f16 s4, s10
-; CHECK-NEXT: vins.f16 s4, s0
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: vmov.16 q1[3], r2
-; CHECK-NEXT: vmovx.f16 s0, s23
-; CHECK-NEXT: vmovx.f16 s7, s11
-; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s7, s0
-; CHECK-NEXT: vmov.f32 s9, s31
-; CHECK-NEXT: vmov.f32 s6, s23
-; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vmov.f32 s10, s31
-; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vins.f16 s9, s0
-; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmovx.f16 s0, s16
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT: vmov.f64 d0, d6
-; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s0, s16
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.16 q0[4], r2
-; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s3, s13
-; CHECK-NEXT: vmov.f32 s9, s24
-; CHECK-NEXT: vins.f16 s3, s17
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmovx.f16 s0, s12
-; CHECK-NEXT: vmov.f32 s10, s24
-; CHECK-NEXT: vins.f16 s9, s0
-; CHECK-NEXT: vmov.f32 s5, s12
-; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s9, s20
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov.f32 s10, s20
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.f64 d0, d14
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmov.f32 s20, s21
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.f32 s3, s29
-; CHECK-NEXT: vins.f16 s3, s5
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmovx.f16 s0, s28
-; CHECK-NEXT: vins.f16 s9, s0
-; CHECK-NEXT: vmov.f32 s5, s28
-; CHECK-NEXT: vmovx.f16 s0, s10
-; CHECK-NEXT: vins.f16 s6, s0
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov q1, q4
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
+; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT: vmovx.f16 s0, s6
+; CHECK-NEXT: vmovx.f16 s8, s14
+; CHECK-NEXT: vins.f16 s8, s0
+; CHECK-NEXT: vmov r2, s15
+; CHECK-NEXT: vmov q6, q3
+; CHECK-NEXT: vmov.16 q2[3], r2
+; CHECK-NEXT: vmovx.f16 s0, s7
+; CHECK-NEXT: vmovx.f16 s11, s27
+; CHECK-NEXT: vins.f16 s11, s0
+; CHECK-NEXT: vmovx.f16 s2, s23
+; CHECK-NEXT: vmov.f32 s10, s7
+; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vins.f16 s23, s0
+; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s9, s23
+; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT: vins.f16 s10, s2
 ; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s16, s26
-; CHECK-NEXT: vmovx.f16 s8, s6
-; CHECK-NEXT: vins.f16 s8, s16
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmovx.f16 s11, s7
-; CHECK-NEXT: vmovx.f16 s16, s27
-; CHECK-NEXT: vmov q1, q3
-; CHECK-NEXT: vins.f16 s11, s16
-; CHECK-NEXT: vmov.f32 s1, s7
-; CHECK-NEXT: vmov.f32 s10, s27
-; CHECK-NEXT: vmovx.f16 s16, s9
-; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vins.f16 s1, s16
-; CHECK-NEXT: vmovx.f16 s16, s2
-; CHECK-NEXT: vins.f16 s10, s16
-; CHECK-NEXT: vmovx.f16 s16, s29
-; CHECK-NEXT: vmov.f32 s2, s10
-; CHECK-NEXT: vins.f16 s20, s16
-; CHECK-NEXT: vmovx.f16 s16, s30
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmovx.f16 s0, s8
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov.f64 d14, d0
 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vins.f16 s22, s16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s23, s22
-; CHECK-NEXT: vmov.f32 s22, s30
-; CHECK-NEXT: vrev32.16 q3, q0
-; CHECK-NEXT: vmovx.f16 s16, s21
-; CHECK-NEXT: vmov.f32 s24, s25
-; CHECK-NEXT: vins.f16 s13, s16
+; CHECK-NEXT: vins.f16 s28, s8
+; CHECK-NEXT: vmov.16 q7[4], r2
+; CHECK-NEXT: vmov.f32 s31, s1
+; CHECK-NEXT: vins.f16 s31, s9
+; CHECK-NEXT: vmov q2, q6
+; CHECK-NEXT: vmov.f64 d12, d10
+; CHECK-NEXT: vmov.f32 s29, s0
+; CHECK-NEXT: vmovx.f16 s0, s0
+; CHECK-NEXT: vmovx.f16 s2, s12
+; CHECK-NEXT: vins.f16 s12, s0
+; CHECK-NEXT: vmovx.f16 s0, s8
+; CHECK-NEXT: vins.f16 s24, s8
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmovx.f16 s0, s20
+; CHECK-NEXT: vmov.16 q6[4], r0
+; CHECK-NEXT: vmov.f32 s29, s12
+; CHECK-NEXT: vmov.f32 s27, s21
+; CHECK-NEXT: vins.f16 s30, s2
+; CHECK-NEXT: vins.f16 s27, s9
+; CHECK-NEXT: vmovx.f16 s2, s4
+; CHECK-NEXT: vmov.f32 s25, s20
+; CHECK-NEXT: vins.f16 s4, s0
+; CHECK-NEXT: vmov.f32 s25, s4
+; CHECK-NEXT: vmov q2, q4
+; CHECK-NEXT: vins.f16 s26, s2
 ; CHECK-NEXT: vmovx.f16 s16, s14
-; CHECK-NEXT: vins.f16 s22, s16
-; CHECK-NEXT: vmovx.f16 s16, s5
-; CHECK-NEXT: vins.f16 s24, s16
-; CHECK-NEXT: vmovx.f16 s16, s6
-; CHECK-NEXT: vins.f16 s26, s16
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s27, s26
-; CHECK-NEXT: vmov.f32 s26, s6
-; CHECK-NEXT: vrev32.16 q4, q0
-; CHECK-NEXT: vmovx.f16 s4, s25
-; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vins.f16 s17, s4
-; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vmovx.f16 s28, s18
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vins.f16 s26, s28
-; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s14, s22
-; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmovx.f16 s0, s10
+; CHECK-NEXT: vmov r0, s11
+; CHECK-NEXT: vins.f16 s0, s16
+; CHECK-NEXT: vmovx.f16 s16, s15
+; CHECK-NEXT: vmov.16 q0[3], r0
+; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s3, s11
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vins.f16 s3, s16
+; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s15
+; CHECK-NEXT: vmovx.f16 s8, s1
+; CHECK-NEXT: vmovx.f16 s10, s19
+; CHECK-NEXT: vins.f16 s19, s8
+; CHECK-NEXT: vmovx.f16 s8, s21
+; CHECK-NEXT: vmov.f32 s12, s13
+; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vmovx.f16 s8, s22
+; CHECK-NEXT: vins.f16 s6, s8
+; CHECK-NEXT: vmov.f32 s1, s19
+; CHECK-NEXT: vmov.f32 s7, s6
+; CHECK-NEXT: vins.f16 s2, s10
+; CHECK-NEXT: vmov.f32 s6, s22
+; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmovx.f16 s8, s5
+; CHECK-NEXT: vstrw.32 q0, [r1, #32]
+; CHECK-NEXT: vmovx.f16 s5, s21
 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s29, s5
-; CHECK-NEXT: vmov.f32 s30, s6
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s5, s1
-; CHECK-NEXT: vstrw.32 q7, [r1]
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s21, s13
-; CHECK-NEXT: vstrw.32 q1, [r1, #48]
-; CHECK-NEXT: vmov.f32 s9, s1
-; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s25, s17
-; CHECK-NEXT: vstrw.32 q2, [r1, #32]
-; CHECK-NEXT: vmov.f32 s22, s14
+; CHECK-NEXT: vins.f16 s5, s8
+; CHECK-NEXT: vmovx.f16 s8, s17
+; CHECK-NEXT: vins.f16 s12, s8
+; CHECK-NEXT: vmovx.f16 s8, s18
+; CHECK-NEXT: vins.f16 s14, s8
+; CHECK-NEXT: vins.f16 s6, s22
+; CHECK-NEXT: vmov.f32 s15, s14
+; CHECK-NEXT: vstrw.32 q6, [r1, #48]
+; CHECK-NEXT: vmov.f32 s14, s18
+; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmovx.f16 s8, s13
+; CHECK-NEXT: vstrw.32 q1, [r1, #64]
+; CHECK-NEXT: vmovx.f16 s13, s17
 ; CHECK-NEXT: vstrw.32 q0, [r1, #80]
-; CHECK-NEXT: vmov.f32 s26, s18
-; CHECK-NEXT: vstrw.32 q5, [r1, #64]
-; CHECK-NEXT: vstrw.32 q6, [r1, #16]
-; CHECK-NEXT: add sp, #144
+; CHECK-NEXT: vins.f16 s13, s8
+; CHECK-NEXT: vstrw.32 q7, [r1]
+; CHECK-NEXT: vins.f16 s14, s18
+; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: add sp, #64
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -203,39 +203,32 @@ define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) {
 ; CHECK-LABEL: vst4_v4i32_align1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s0, s9
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: vdup.32 q4, r0
-; CHECK-NEXT: vmov.f32 s1, s5
-; CHECK-NEXT: vmov.f32 s2, s18
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov.f32 s3, s19
-; CHECK-NEXT: vdup.32 q4, r0
-; CHECK-NEXT: vmov.f32 s9, s4
-; CHECK-NEXT: vstrb.8 q0, [r1, #16]
-; CHECK-NEXT: vmov r0, s15
-; CHECK-NEXT: vmov.f32 s16, s8
-; CHECK-NEXT: vdup.32 q6, r0
-; CHECK-NEXT: vmov.f32 s20, s11
-; CHECK-NEXT: vmov.f32 s8, s10
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov.f32 s21, s7
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmov.f32 s22, s26
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vmov.f32 s4, s13
+; CHECK-NEXT: vmov.f32 s13, s8
+; CHECK-NEXT: vmov.f32 s20, s15
+; CHECK-NEXT: vmov.f32 s5, s9
+; CHECK-NEXT: vmov.f32 s21, s11
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s22, s3
+; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f64 d8, d6
+; CHECK-NEXT: vstrb.8 q1, [r1, #16]
+; CHECK-NEXT: vmov.f32 s17, s8
+; CHECK-NEXT: vmov.f32 s18, s0
+; CHECK-NEXT: vmov.f32 s19, s0
+; CHECK-NEXT: vmov.f32 s23, s3
 ; CHECK-NEXT: vstrb.8 q4, [r1]
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.f32 s11, s7
+; CHECK-NEXT: vmov.f32 s0, s14
 ; CHECK-NEXT: vstrb.8 q5, [r1, #48]
-; CHECK-NEXT: vstrb.8 q2, [r1, #32]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmov.f32 s1, s10
+; CHECK-NEXT: vmov.f32 s3, s2
+; CHECK-NEXT: vstrb.8 q0, [r1, #32]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
--- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
@@ -11,7 +11,7 @@
 ; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; SSE-32-NEXT: xorps %xmm0, %xmm0
 ; SSE-32-NEXT: xorps %xmm1, %xmm1
-; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
+; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0]
 ; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
 ; SSE-32-NEXT: movups %xmm0, 624(%eax)
 ; SSE-32-NEXT: movups %xmm1, 608(%eax)
@@ -21,7 +21,7 @@
 ; SSE-64: # %bb.0: # %L.entry
 ; SSE-64-NEXT: xorps %xmm0, %xmm0
 ; SSE-64-NEXT: xorps %xmm1, %xmm1
-; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
+; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
 ; SSE-64-NEXT: movups %xmm0, 624(%rsi)
 ; SSE-64-NEXT: movups %xmm1, 608(%rsi)
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -521,7 +521,7 @@
 ; SSE2-NEXT: movaps %xmm2, %xmm5
 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,0]
 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[3,0]
 ; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,2]
 ; SSE2-NEXT: movaps %xmm2, 32(%rdi)
@@ -1210,36 +1210,36 @@
 ; SSE2-NEXT: movups 80(%rdi), %xmm8
 ; SSE2-NEXT: movups 64(%rdi), %xmm3
 ; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: movups 16(%rdi), %xmm5
+; SSE2-NEXT: movups 16(%rdi), %xmm6
 ; SSE2-NEXT: movups 32(%rdi), %xmm10
 ; SSE2-NEXT: movdqu 48(%rdi), %xmm2
 ; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: movaps %xmm10, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3]
+; SSE2-NEXT: movaps %xmm6, %xmm7
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: movaps %xmm8, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm6[0,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: movaps %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm3[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3]
-; SSE2-NEXT: movups %xmm5, 16(%rsi)
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,3]
+; SSE2-NEXT: movups %xmm6, 16(%rsi)
 ; SSE2-NEXT: movups %xmm11, (%rsi)
 ; SSE2-NEXT: movups %xmm2, 16(%rdx)
 ; SSE2-NEXT: movups %xmm1, (%rdx)
-; SSE2-NEXT: movups %xmm6, 16(%rcx)
+; SSE2-NEXT: movups %xmm5, 16(%rcx)
 ; SSE2-NEXT: movups %xmm0, (%rcx)
 ; SSE2-NEXT: retq
 ;
@@ -1247,38 +1247,36 @@
 ; SSE42: # %bb.0:
 ; SSE42-NEXT: movups 80(%rdi), %xmm8
 ; SSE42-NEXT: movdqu 64(%rdi), %xmm9
-; SSE42-NEXT: movdqu (%rdi), %xmm4
+; SSE42-NEXT: movdqu (%rdi), %xmm3
 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2
 ; SSE42-NEXT: movups 32(%rdi), %xmm10
 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5
 ; SSE42-NEXT: movdqa %xmm2, %xmm6
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
-; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
+; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3]
+; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[1]
 ; SSE42-NEXT: movdqa %xmm9, %xmm1
 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
 ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3]
 ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
+; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2]
 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6,7]
 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,3]
 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3]
 ; SSE42-NEXT: movups %xmm5, 16(%rsi)
-; SSE42-NEXT: movups %xmm4, (%rsi)
-; SSE42-NEXT: movdqu %xmm3, 16(%rdx)
+; SSE42-NEXT: movups %xmm3, (%rsi)
+; SSE42-NEXT: movdqu %xmm4, 16(%rdx)
 ; SSE42-NEXT: movdqu %xmm6, (%rdx)
-; SSE42-NEXT: movdqu %xmm2, 16(%rcx)
-; SSE42-NEXT: movdqu %xmm1, (%rcx)
+; SSE42-NEXT: movups %xmm0, 16(%rcx)
+; SSE42-NEXT: movups %xmm7, (%rcx)
 ; SSE42-NEXT: retq
 ;
 ; AVX1-LABEL: interleave_24i32_out:
@@ -1445,7 +1443,7 @@
 ; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2]
 ; SSE2-NEXT: movaps %xmm5, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,0]
 ; SSE2-NEXT: movaps %xmm6, %xmm4
 ; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2]
@@ -1458,7 +1456,7 @@
 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2]
 ; SSE2-NEXT: movaps %xmm8, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[3,0]
 ; SSE2-NEXT: movaps %xmm3, %xmm6
 ; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2]
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -14,38 +14,39 @@
 ; CHECK-NEXT: vmovaps %ymm4, %ymm10
 ; CHECK-NEXT: vmovaps %ymm3, %ymm9
 ; CHECK-NEXT: vmovaps %ymm1, %ymm8
-; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
-; CHECK-NEXT: vmovaps 208(%rbp), %ymm3
+; CHECK-NEXT: vmovaps 208(%rbp), %ymm4
 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
 ; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
 ; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
 ; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
-; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vmovaps %xmm3, %xmm8
+; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vmovaps %xmm4, %xmm6
 ; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2
-; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
+; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
-; CHECK-NEXT: vmovaps %xmm7, %xmm2
-; CHECK-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vmovaps %xmm4, %xmm2
-; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
-; CHECK-NEXT: vmovq {{.*#+}} xmm4 = xmm2[0],zero
+; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
 ; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vmovaps %xmm4, %xmm2
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm6[0,1]
+; CHECK-NEXT: vmovaps %xmm6, %xmm2
+; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3
+; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vmovaps %xmm7, %xmm3
+; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: # implicit-def: $ymm3
+; CHECK-NEXT: vmovaps %xmm6, %xmm3
+; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
+; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; CHECK-NEXT: movq %rbp, %rsp
 ; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: .cfi_def_cfa %rsp, 8
diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
--- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -5,10 +5,9 @@ define <4 x i64> @autogen_SD88863() {
 ; CHECK-LABEL: autogen_SD88863:
 ; CHECK: # %bb.0: # %BB
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT: movb $1, %al
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB0_1: # %CF
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -108,12 +108,12 @@
 ; X86-NEXT: vmovdqu (%eax), %ymm1
 ; X86-NEXT: vpmovqw %ymm0, %xmm0
 ; X86-NEXT: vpmovqw %ymm1, %xmm1
-; X86-NEXT: vpsllw $8, %xmm1, %xmm1
-; X86-NEXT: vpsraw $8, %xmm1, %xmm1
 ; X86-NEXT: vpsllw $8, %xmm0, %xmm0
 ; X86-NEXT: vpsraw $8, %xmm0, %xmm0
-; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
-; X86-NEXT: vmovupd %ymm0, (%eax)
+; X86-NEXT: vpsllw $8, %xmm1, %xmm1
+; X86-NEXT: vpsraw $8, %xmm1, %xmm1
+; X86-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X86-NEXT: vmovdqu %ymm0, (%eax)
 ; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
@@ -126,8 +126,9 @@
 ; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; X64-NEXT: vpsllw $8, %ymm0, %ymm0
 ; X64-NEXT: vpsraw $8, %ymm0, %ymm0
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: vmovdqa %xmm0, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1]
 ; X64-NEXT: vmovdqu %ymm0, (%rdi)
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq