diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22834,6 +22834,46 @@
                               InnerShuf->getOperand(1), CombinedMask);
 }
 
+/// Given a vector load that is only used by a splat shuffle, scalarize the
+/// load to only load the element required for the splat.
+static SDValue scalarizeLoadIntoSplat(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  if (!Shuf->isSplat())
+    return SDValue();
+
+  EVT VecVT = Shuf->getOperand(0).getValueType();
+  unsigned NumElts = VecVT.getVectorNumElements();
+  // Mask values >= NumElts select their element from the second operand.
+  unsigned Idx = Shuf->getSplatIndex();
+  SDValue SplattedOp = Shuf->getOperand(Idx < NumElts ? 0 : 1);
+  Idx %= NumElts;
+
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(SplattedOp.getNode());
+  if (!Load)
+    return SDValue();
+
+  if (!(Load->isSimple() && SplattedOp.hasOneUse() && VecVT.isVector()))
+    return SDValue();
+
+  auto &TLI = DAG.getTargetLoweringInfo();
+  SDValue SplatIdx =
+      DAG.getConstant(Idx, SDLoc(Shuf), MVT::i32);
+  SDValue NewPtr =
+      TLI.getVectorElementPointer(DAG, Load->getBasePtr(), VecVT, SplatIdx);
+
+  EVT VecEltVT = VecVT.getVectorElementType();
+  unsigned PtrOff = VecEltVT.getSizeInBits() * Idx / 8;
+  MachinePointerInfo MPI = Load->getPointerInfo().getWithOffset(PtrOff);
+  Align Alignment = commonAlignment(Load->getAlign(), PtrOff);
+
+  auto NewLoad = DAG.getLoad(VecEltVT, SDLoc(Load), Load->getChain(), NewPtr,
+                             MPI, Alignment, Load->getMemOperand()->getFlags(),
+                             Load->getAAInfo());
+  DAG.makeEquivalentMemoryOrdering(Load, NewLoad);
+
+  return DAG.getSplatBuildVector(Shuf->getValueType(0), SDLoc(Shuf), NewLoad);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the mask element that is choosing an element
@@ -22989,6 +23029,9 @@
   if (SDValue V = formSplatFromShuffles(SVN, DAG))
     return V;
 
+  if (SDValue V = scalarizeLoadIntoSplat(SVN, DAG))
+    return V;
+
   // If it is a splat, check if the argument vector is another splat or a
   // build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll --- a/llvm/test/CodeGen/AArch64/arm64-dup.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll @@ -202,8 +202,8 @@ define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { ; CHECK-LABEL: vduplane8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: dup.8b v0, v0[1] +; CHECK-NEXT: add x8, x0, #1 +; CHECK-NEXT: ld1r.8b { v0 }, [x8] ; CHECK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > @@ -213,8 +213,8 @@ define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { ; CHECK-LABEL: vduplane16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: dup.4h v0, v0[1] +; CHECK-NEXT: add x8, x0, #2 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > @@ -224,8 +224,8 @@ define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { ; CHECK-LABEL: vduplane32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: dup.2s v0, v0[1] +; CHECK-NEXT: add x8, x0, #4 +; CHECK-NEXT: ld1r.2s { v0 }, [x8] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > @@ -235,8 +235,8 @@ define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { ; CHECK-LABEL: vduplanefloat: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: dup.2s v0, v0[1] +; CHECK-NEXT: add x8, x0, #4 +; CHECK-NEXT: ld1r.2s { v0 }, [x8] ; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1084,9 +1084,10 @@ define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: mul_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: mul.4h v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: mul.4h v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1098,9 +1099,10 @@ define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: mul_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mul.8h v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.8h { v0 }, [x8] +; CHECK-NEXT: mul.8h v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -1112,9 +1114,10 @@ define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-LABEL: mul_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: mul.2s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: mul.2s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1126,9 +1129,10 @@ define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; CHECK-LABEL: mul_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, 
[x1] -; CHECK-NEXT: mul.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.4s { v0 }, [x8] +; CHECK-NEXT: mul.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -1157,8 +1161,8 @@ ; CHECK-LABEL: fmul_lane_2s: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: fmul.2s v0, v0, v1[1] +; CHECK-NEXT: ldr s1, [x1, #4] +; CHECK-NEXT: fmul.2s v0, v0, v1[0] ; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B @@ -1171,8 +1175,8 @@ ; CHECK-LABEL: fmul_lane_4s: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmul.4s v0, v0, v1[1] +; CHECK-NEXT: ldr s1, [x1, #4] +; CHECK-NEXT: fmul.4s v0, v0, v1[0] ; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B @@ -1185,8 +1189,8 @@ ; CHECK-LABEL: fmul_lane_2d: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmul.2d v0, v0, v1[1] +; CHECK-NEXT: ldr d1, [x1, #8] +; CHECK-NEXT: fmul.2d v0, v0, v1[0] ; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B @@ -1220,9 +1224,10 @@ define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { ; CHECK-LABEL: fmulx_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: fmulx.2s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: fmulx.2s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B @@ -1234,9 +1239,10 @@ define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { ; CHECK-LABEL: fmulx_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmulx.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.4s { v0 }, [x8] +; CHECK-NEXT: fmulx.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B @@ -1248,9 +1254,10 @@ define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind { ; CHECK-LABEL: fmulx_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: fmulx.2d v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #8 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.2d { v0 }, [x8] +; CHECK-NEXT: fmulx.2d v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B @@ -1262,9 +1269,10 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: sqdmulh.4h v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1276,9 +1284,10 @@ define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.8h { v0 }, [x8] +; CHECK-NEXT: sqdmulh.8h v0, v1, v0 ; 
CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -1290,9 +1299,10 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: sqdmulh.2s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1304,9 +1314,10 @@ define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.4s { v0 }, [x8] +; CHECK-NEXT: sqdmulh.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -1330,9 +1341,10 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: sqrdmulh.4h v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1344,9 +1356,10 @@ define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.8h { v0 }, [x8] +; CHECK-NEXT: sqrdmulh.8h v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -1358,9 +1371,10 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: sqrdmulh.2s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1372,9 +1386,10 @@ define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ld1r.4s { v0 }, [x8] +; CHECK-NEXT: sqrdmulh.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -1398,9 +1413,10 @@ define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: sqdmull_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: sqdmull.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1412,9 +1428,10 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-LABEL: sqdmull_lane_2d: ; CHECK: // %bb.0: -; 
CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: sqdmull.2d v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1426,9 +1443,10 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: sqdmull2_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0, #8] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: sqdmull.4s v0, v1, v0 ; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B @@ -1441,9 +1459,10 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; CHECK-LABEL: sqdmull2_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0, #8] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: sqdmull.2d v0, v1, v0 ; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B @@ -1456,9 +1475,10 @@ define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: umull_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umull.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: umull.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1470,9 +1490,10 @@ define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-LABEL: umull_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: umull.2d v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: umull.2d v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1484,9 +1505,10 @@ define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: smull_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: smull.4s v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: smull.4s v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1498,9 +1520,10 @@ define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; CHECK-LABEL: smull_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: smull.2d v0, v0, v1[1] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ld1r.2s { v0 }, [x8] +; CHECK-NEXT: smull.2d v0, v1, v0 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1512,10 +1535,11 @@ define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: smlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.4s v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: smlal.4s v0, v2, v1 ; CHECK-NEXT: 
ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1529,10 +1553,11 @@ define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: smlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.2d v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: smlal.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1546,10 +1571,11 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.4s v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: sqdmlal.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1563,10 +1589,11 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.2d v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: sqdmlal.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1580,10 +1607,11 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlal2_lane_4s: ; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1] +; CHECK-NEXT: ldr d2, [x0, #8] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: sqdmlal.4s v0, v2, v1 ; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B @@ -1598,10 +1626,11 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlal2_lane_2d: ; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1] +; CHECK-NEXT: ldr d2, [x0, #8] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: sqdmlal.2d v0, v2, v1 ; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B @@ -1718,10 +1747,11 @@ define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: umlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.4s v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: umlal.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1735,10 +1765,11 @@ define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: umlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.2d v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; 
CHECK-NEXT: umlal.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1753,10 +1784,11 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: smlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.4s v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: smlsl.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1770,10 +1802,11 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: smlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.2d v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: smlsl.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1787,10 +1820,11 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.4s v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: sqdmlsl.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1804,10 +1838,11 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.2d v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: sqdmlsl.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1821,10 +1856,11 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlsl2_lane_4s: ; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1] +; CHECK-NEXT: ldr d2, [x0, #8] +; CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: sqdmlsl.4s v0, v2, v1 ; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B @@ -1839,10 +1875,11 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlsl2_lane_2d: ; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1] +; CHECK-NEXT: ldr d2, [x0, #8] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: sqdmlsl.2d v0, v2, v1 ; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B @@ -1857,10 +1894,11 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: umlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.4s v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; 
CHECK-NEXT: ld1r.4h { v1 }, [x8] +; CHECK-NEXT: umlsl.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1874,10 +1912,11 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: umlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.2d v0, v2, v1[1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: ld1r.2s { v1 }, [x8] +; CHECK-NEXT: umlsl.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -716,8 +716,7 @@ ; CHECK-LABEL: load_splat_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret %v = load <8 x float>, ptr %p @@ -729,8 +728,7 @@ ; CHECK-LABEL: load_splat_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: ret %v = load <4 x double>, ptr %p @@ -742,8 +740,7 @@ ; CHECK-LABEL: load_splat_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: mov z0.b, b0 +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] ; CHECK-NEXT: st1b { z0.b }, p0, [x8] ; CHECK-NEXT: ret %v = load <32 x i8>, ptr %p @@ -755,8 +752,7 @@ ; CHECK-LABEL: load_splat_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: st1h { z0.h }, p0, [x8] ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %p @@ -768,8 +764,7 @@ ; CHECK-LABEL: load_splat_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret %v = load <8 x i32>, ptr %p @@ -781,8 +776,7 @@ ; CHECK-LABEL: load_splat_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: ret %v = load <4 x i64>, ptr %p diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -135,24 +135,28 @@ ; GFX9-LABEL: shuffle_v4f16_u3uu: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_u3uu: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: global_load_ushort v0, v[0:1], 
off offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_u3uu: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1168,7 +1172,7 @@ ; GFX9-LABEL: shuffle_v4f16_0000: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 @@ -1179,7 +1183,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 @@ -1189,7 +1193,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1902,31 +1906,33 @@ ; GFX9-LABEL: hi16bits: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: hi16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 +; GFX10-NEXT: global_load_short_d16 v2, v[0:1], off offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: hi16bits: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 @@ -1978,30 +1984,31 @@ ; GFX9-LABEL: hi16low16bits: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: 
global_load_ushort v4, v[0:1], off offset:2 ; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: hi16low16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:2 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: hi16low16bits: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 @@ -2091,30 +2098,31 @@ ; GFX9-LABEL: i16_hi16low16bits: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_ushort v4, v[0:1], off offset:2 ; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: i16_hi16low16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_ushort v4, v[0:1], off offset:2 ; GFX10-NEXT: global_load_dword v5, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: i16_hi16low16bits: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 @@ -2128,31 +2136,33 @@ ; GFX9-LABEL: i16_hi16bits: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: i16_hi16bits: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v5, 
v[2:3], off +; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 +; GFX10-NEXT: global_load_short_d16 v2, v[0:1], off offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: i16_hi16bits: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll --- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll +++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll @@ -8,9 +8,10 @@ define i32 @foo(%struct.desc* %descs, i32 %num, i32 %cw) local_unnamed_addr #0 { ; CHECK-LABEL: foo: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr d16, [r0, #32] +; CHECK-NEXT: add r0, r0, #32 +; CHECK-NEXT: vld1.32 {d16[]}, [r0:32] ; CHECK-NEXT: vadd.i32 d16, d16, d16 -; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: vmov.32 r0, d16[1] ; CHECK-NEXT: bx lr entry: %descs.vec = bitcast %struct.desc* %descs to <16 x i32>* diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -222,8 +222,8 @@ define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { ; CHECK-LABEL: vduplane8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.8 d16, d16[1] +; CHECK-NEXT: add r0, r0, #1 +; CHECK-NEXT: vld1.8 {d16[]}, [r0] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A @@ -234,8 +234,8 @@ define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { ; CHECK-LABEL: vduplane16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.16 d16, d16[1] +; CHECK-NEXT: add r0, r0, #2 +; CHECK-NEXT: vld1.16 {d16[]}, [r0:16] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A @@ -246,8 +246,8 @@ define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { ; CHECK-LABEL: vduplane32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.32 d16, d16[1] +; CHECK-NEXT: add r0, r0, #4 +; CHECK-NEXT: vld1.32 {d16[]}, [r0:32] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A @@ -258,8 +258,8 @@ define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { ; CHECK-LABEL: vduplanefloat: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.32 d16, d16[1] +; CHECK-NEXT: add r0, r0, #4 +; CHECK-NEXT: vld1.32 {d16[]}, [r0:32] ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x float>, <2 x float>* %A diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -1127,16 +1127,11 @@ ; ; CHECK-NOVSX-LABEL: testSplati64_1: ; CHECK-NOVSX: # %bb.0: # %entry -; CHECK-NOVSX-NEXT: ld r4, 8(r3) -; CHECK-NOVSX-NEXT: std r4, -8(r1) -; 
CHECK-NOVSX-NEXT: addis r4, r2, .LCPI21_0@toc@ha -; CHECK-NOVSX-NEXT: ld r3, 0(r3) -; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI21_0@toc@l -; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: ld r3, 8(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -8(r1) ; CHECK-NOVSX-NEXT: std r3, -16(r1) -; CHECK-NOVSX-NEXT: addi r3, r1, -16 -; CHECK-NOVSX-NEXT: lvx v3, 0, r3 -; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2 +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 ; CHECK-NOVSX-NEXT: blr ; ; CHECK-P7-LABEL: testSplati64_1: @@ -1145,11 +1140,26 @@ ; CHECK-P7-NEXT: lxvdsx v2, 0, r3 ; CHECK-P7-NEXT: blr ; -; P8-AIX-LABEL: testSplati64_1: -; P8-AIX: # %bb.0: # %entry -; P8-AIX-NEXT: addi r3, r3, 8 -; P8-AIX-NEXT: lxvdsx v2, 0, r3 -; P8-AIX-NEXT: blr +; P8-AIX-64-LABEL: testSplati64_1: +; P8-AIX-64: # %bb.0: # %entry +; P8-AIX-64-NEXT: addi r3, r3, 8 +; P8-AIX-64-NEXT: lxvdsx v2, 0, r3 +; P8-AIX-64-NEXT: blr +; +; P8-AIX-32-LABEL: testSplati64_1: +; P8-AIX-32: # %bb.0: # %entry +; P8-AIX-32-NEXT: lwz r4, L..C4(r2) # %const.0 +; P8-AIX-32-NEXT: lwz r5, 12(r3) +; P8-AIX-32-NEXT: lwz r3, 8(r3) +; P8-AIX-32-NEXT: stw r5, -16(r1) +; P8-AIX-32-NEXT: stw r3, -32(r1) +; P8-AIX-32-NEXT: addi r3, r1, -16 +; P8-AIX-32-NEXT: lxvw4x v2, 0, r4 +; P8-AIX-32-NEXT: addi r4, r1, -32 +; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 +; P8-AIX-32-NEXT: lxvw4x v4, 0, r4 +; P8-AIX-32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX-32-NEXT: blr entry: %0 = load <2 x i64>, ptr %ptr, align 8 %1 = shufflevector <2 x i64> %0, <2 x i64> undef, <2 x i32> <i32 1, i32 1> diff --git a/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-vectorized-load-splat.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s + +; Ensures that vectorized loads that are really just splatted loads are indeed +; selected as splatted loads + +target triple = "wasm32-unknown-unknown" + +define <4 x i32> @load_splat_shuffle_lhs(ptr %p) { +; CHECK-LABEL: load_splat_shuffle_lhs: +; CHECK: .functype load_splat_shuffle_lhs (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load64_splat 0 +; CHECK-NEXT: # fallthrough-return + %a = load <2 x i64>, ptr %p + %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> + %c = bitcast <2 x i64> %b to <4 x i32> + %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @load_splat_shuffle_lhs_with_offset(ptr %p) { +; CHECK-LABEL: load_splat_shuffle_lhs_with_offset: +; CHECK: .functype load_splat_shuffle_lhs_with_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_splat 0 +; CHECK-NEXT: # fallthrough-return + %a = load <2 x i64>, ptr %p + %b = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> + %c = bitcast <2 x i64> %b to <4 x i32> + %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @load_splat_shuffle_rhs(ptr %p) { +; CHECK-LABEL: load_splat_shuffle_rhs: +; CHECK: .functype load_splat_shuffle_rhs (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: v128.load64_splat 0 +; CHECK-NEXT: # fallthrough-return + %a = load <2 x i64>, ptr %p + %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> + %c = bitcast <2 x i64> %b to <4 x
i32> + %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> + ret <4 x i32> %d +} + +define <4 x i32> @load_splat_shuffle_rhs_with_offset(ptr %p) { +; CHECK-LABEL: load_splat_shuffle_rhs_with_offset: +; CHECK: .functype load_splat_shuffle_rhs_with_offset (i32) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 8 +; CHECK-NEXT: i32.add +; CHECK-NEXT: v128.load64_splat 0 +; CHECK-NEXT: # fallthrough-return + %a = load <2 x i64>, ptr %p + %b = shufflevector <2 x i64> poison, <2 x i64> %a, <2 x i32> + %c = bitcast <2 x i64> %b to <4 x i32> + %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> + ret <4 x i32> %d +} diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -370,12 +370,12 @@ ; X86-LABEL: load_splat_4i32_4i32_1111: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; X86-NEXT: vbroadcastss 4(%eax), %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: load_splat_4i32_4i32_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; X64-NEXT: vbroadcastss 4(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <4 x i32>, ptr %ptr @@ -472,12 +472,12 @@ ; X86-LABEL: load_splat_2i64_2i64_1111: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X86-NEXT: retl ; ; X64-LABEL: load_splat_2i64_2i64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X64-NEXT: retq entry: %ld = load <2 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -84,7 +84,7 @@ ; X32-SSE2-LABEL: t4: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movdqa (%eax), %xmm0 +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-SSE2-NEXT: movd %xmm0, %eax ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X32-SSE2-NEXT: movd %xmm0, %edx diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1332,23 +1332,24 @@ define <8 x half> @shuffle(ptr %p) { ; CHECK-LIBCALL-LABEL: shuffle: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-LIBCALL-NEXT: pinsrw $0, 8(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: shuffle: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; BWON-F16C-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0 +; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: shuffle: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movdqu (%eax), %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-I686-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-I686-NEXT: pinsrw $0, 8(%eax), %xmm0 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-I686-NEXT: retl %1 = load <8 x half>, ptr %p, align 8 %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -395,14 +395,14 @@ define <4 x i32> @t17() nounwind { ; X86-LABEL: t17: ; X86: # %bb.0: # %entry -; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X86-NEXT: retl ; ; X64-LABEL: t17: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X64-NEXT: retq entry: %tmp1 = load <4 x float>, ptr undef, align 16 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1587,9 +1587,8 @@ ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08] -; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: @@ -1608,9 +1607,8 @@ ; ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f] -; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1222,7 +1222,7 @@ define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind { ; SSE2-LABEL: insert_dup_mem128_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll --- a/llvm/test/CodeGen/X86/widened-broadcast.ll +++ b/llvm/test/CodeGen/X86/widened-broadcast.ll @@ -468,7 +468,7 @@ define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_4f32_8f32_0000: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ;