Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14555,6 +14555,52 @@
   return DAG.getBitcast(VT, NarrowBinOp);
 }
 
+/// If we are extracting a subvector from a wide vector load, convert to a
+/// narrow load to eliminate the extraction:
+/// (extract_subvector (load wide vector)) --> (load narrow vector)
+static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
+  // TODO: Add support for big-endian. The offset calculation must be adjusted.
+  if (DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
+  // TODO: The one-use check is overly conservative. Check the cost of the
+  // extract instead or remove that condition entirely.
+  auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
+  auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+  if (!Ld || !Ld->hasOneUse() || Ld->isVolatile() || !ExtIdx)
+    return SDValue();
+
+  // The narrow load will be offset from the base address of the old load if
+  // we are extracting from something besides index 0 (little-endian).
+  EVT VT = Extract->getValueType(0);
+  SDLoc DL(Extract);
+  SDValue BaseAddr = Ld->getOperand(1);
+  unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
+
+  // TODO: Use "BaseIndexOffset" to make this more effective.
+  SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
+                                                   VT.getStoreSize());
+  SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
+
+  // The new load must have the same position as the old load in terms of
+  // memory dependency. Create a TokenFactor for Ld and NewLd and update uses
+  // of Ld's output chain to use that TokenFactor.
+  // TODO: This code is based on a similar sequence in x86 lowering. It should
+  // be moved to a helper function, so it can be shared and reused.
+  if (Ld->hasAnyUseOfValue(1)) {
+    SDValue OldChain = SDValue(Ld, 1);
+    SDValue NewChain = SDValue(NewLd.getNode(), 1);
+    SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                      OldChain, NewChain);
+    DAG.ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
+    DAG.UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+  }
+
+  return NewLd;
+}
+
 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
   EVT NVT = N->getValueType(0);
   SDValue V = N->getOperand(0);
@@ -14563,6 +14609,10 @@
   if (V.isUndef())
     return DAG.getUNDEF(NVT);
 
+  if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
+    if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
+      return NarrowLoad;
+
   // Combine:
   // (extract_subvec (concat V1, V2, ...), i)
   // Into:
Index: llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vabs.ll
@@ -33,7 +33,7 @@
 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_8h:
-;CHECK: sabdl2.8h
+;CHECK: sabdl.8h
 %load1 = load <16 x i8>, <16 x i8>* %A
 %load2 = load <16 x i8>, <16 x i8>* %B
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
@@ -45,7 +45,7 @@
 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_4s:
-;CHECK: sabdl2.4s
+;CHECK: sabdl.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
@@ -57,7 +57,7 @@
 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_2d:
-;CHECK: sabdl2.2d
+;CHECK: sabdl.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
@@ -99,7 +99,7 @@
 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_8h:
-;CHECK: uabdl2.8h
+;CHECK: uabdl.8h
 %load1 = load <16 x i8>, <16 x i8>* %A
 %load2 = load <16 x i8>, <16 x i8>* %B
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
@@ -112,7 +112,7 @@
 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_4s:
-;CHECK: uabdl2.4s
+;CHECK: uabdl.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
@@ -124,7 +124,7 @@
 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_2d:
-;CHECK: uabdl2.2d
+;CHECK: uabdl.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
@@ -561,7 +561,7 @@
 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: sabal2_8h:
-;CHECK: sabal2.8h
+;CHECK: sabal.8h
 %load1 = load <16 x i8>, <16 x i8>* %A
 %load2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -575,7 +575,7 @@
 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sabal2_4s:
-;CHECK: sabal2.4s
+;CHECK: sabal.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -589,7 +589,7 @@
 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sabal2_2d:
-;CHECK: sabal2.2d
+;CHECK: sabal.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -639,7 +639,7 @@
 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: uabal2_8h:
-;CHECK: uabal2.8h
+;CHECK: uabal.8h
 %load1 = load <16 x i8>, <16 x i8>* %A
 %load2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -653,7 +653,7 @@
 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: uabal2_4s:
-;CHECK: uabal2.4s
+;CHECK: uabal.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -667,7 +667,7 @@
 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: uabal2_2d:
-;CHECK: uabal2.2d
+;CHECK: uabal.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
Index: llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vadd.ll
@@ -318,7 +318,7 @@
 define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_8h:
-;CHECK: uaddw2.8h
+;CHECK: uaddw.8h
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -331,7 +331,7 @@
 define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_4s:
-;CHECK: uaddw2.4s
+;CHECK: uaddw.4s
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -344,7 +344,7 @@
 define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_2d:
-;CHECK: uaddw2.2d
+;CHECK: uaddw.2d
 %tmp1 = load <2 x i64>, <2 x i64>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
@@ -387,7 +387,7 @@
 define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: saddw2_8h:
-;CHECK: saddw2.8h
+;CHECK: saddw.8h
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -400,7 +400,7 @@
 define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: saddw2_4s:
-;CHECK: saddw2.4s
+;CHECK: saddw.4s
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -413,7 +413,7 @@
 define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: saddw2_2d:
-;CHECK: saddw2.2d
+;CHECK: saddw.2d
 %tmp1 = load <2 x i64>, <2 x i64>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
Index: llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vmul.ll
@@ -83,7 +83,7 @@
 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_4s:
-;CHECK: sqdmull2.4s
+;CHECK: sqdmull.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
@@ -94,7 +94,7 @@
 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_2d:
-;CHECK: sqdmull2.2d
+;CHECK: sqdmull.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
@@ -324,7 +324,7 @@
 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_4s:
-;CHECK: sqdmlal2.4s
+;CHECK: sqdmlal.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -337,7 +337,7 @@
 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_2d:
-;CHECK: sqdmlal2.2d
+;CHECK: sqdmlal.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -372,7 +372,7 @@
 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_4s:
-;CHECK: sqdmlsl2.4s
+;CHECK: sqdmlsl.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -385,7 +385,7 @@
 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_2d:
-;CHECK: sqdmlsl2.2d
+;CHECK: sqdmlsl.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -874,7 +874,7 @@
 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmull2.4s
+;CHECK: sqdmull.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
@@ -886,7 +886,7 @@
 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmull2.2d
+;CHECK: sqdmull.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
@@ -994,7 +994,7 @@
 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmlal2.4s
+;CHECK: sqdmlal.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1008,7 +1008,7 @@
 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmlal2.2d
+;CHECK: sqdmlal.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -1147,7 +1147,7 @@
 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmlsl2.4s
+;CHECK: sqdmlsl.4s
 %load1 = load <8 x i16>, <8 x i16>* %A
 %load2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1161,7 +1161,7 @@
 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmlsl2.2d
+;CHECK: sqdmlsl.2d
 %load1 = load <4 x i32>, <4 x i32>* %A
 %load2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
Index: llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1164,7 +1164,7 @@
 define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: ushll2_8h:
-;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1
+;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
 %load1 = load <16 x i8>, <16 x i8>* %A
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
 %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -1174,7 +1174,7 @@
 define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: ushll2_4s:
-;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1
+;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
 %load1 = load <8 x i16>, <8 x i16>* %A
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
 %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -1184,7 +1184,7 @@
 define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: ushll2_2d:
-;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1
+;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
 %load1 = load <4 x i32>, <4 x i32>* %A
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
 %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -1221,7 +1221,7 @@
 define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: sshll2_8h:
-;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1
+;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
 %load1 = load <16 x i8>, <16 x i8>* %A
 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32>
 %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -1231,7 +1231,7 @@
 define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: sshll2_4s:
-;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1
+;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
 %load1 = load <8 x i16>, <8 x i16>* %A
 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32>
 %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -1241,7 +1241,7 @@
 define <2 x i64> @sshll2_2d(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: sshll2_2d:
-;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1
+;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
 %load1 = load <4 x i32>, <4 x i32>* %A
 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32>
 %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
Index: llvm/trunk/test/CodeGen/AArch64/arm64-vsub.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vsub.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vsub.ll
@@ -157,7 +157,7 @@
 define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_8h:
-;CHECK: ssubl2.8h
+;CHECK: ssubl.8h
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32>
 %ext1 = sext <8 x i8> %high1 to <8 x i16>
@@ -172,7 +172,7 @@
 define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_4s:
-;CHECK: ssubl2.4s
+;CHECK: ssubl.4s
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32>
 %ext1 = sext <4 x i16> %high1 to <4 x i32>
@@ -187,7 +187,7 @@
 define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_2d:
-;CHECK: ssubl2.2d
+;CHECK: ssubl.2d
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32>
 %ext1 = sext <2 x i32> %high1 to <2 x i64>
@@ -235,7 +235,7 @@
 define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: usubl2_8h:
-;CHECK: usubl2.8h
+;CHECK: usubl.8h
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32>
 %ext1 = zext <8 x i8> %high1 to <8 x i16>
@@ -250,7 +250,7 @@
 define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: usubl2_4s:
-;CHECK: usubl2.4s
+;CHECK: usubl.4s
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32>
 %ext1 = zext <4 x i16> %high1 to <4 x i32>
@@ -265,7 +265,7 @@
 define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: usubl2_2d:
-;CHECK: usubl2.2d
+;CHECK: usubl.2d
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32>
 %ext1 = zext <2 x i32> %high1 to <2 x i64>
@@ -310,7 +310,7 @@
 define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_8h:
-;CHECK: ssubw2.8h
+;CHECK: ssubw.8h
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -323,7 +323,7 @@
 define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_4s:
-;CHECK: ssubw2.4s
+;CHECK: ssubw.4s
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -336,7 +336,7 @@
 define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_2d:
-;CHECK: ssubw2.2d
+;CHECK: ssubw.2d
 %tmp1 = load <2 x i64>, <2 x i64>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
@@ -379,7 +379,7 @@
 define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: usubw2_8h:
-;CHECK: usubw2.8h
+;CHECK: usubw.8h
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -392,7 +392,7 @@
 define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: usubw2_4s:
-;CHECK: usubw2.4s
+;CHECK: usubw.4s
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -405,7 +405,7 @@
 define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: usubw2_2d:
-;CHECK: usubw2.2d
+;CHECK: usubw.2d
 %tmp1 = load <2 x i64>, <2 x i64>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
Index: llvm/trunk/test/CodeGen/ARM/vcombine.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vcombine.ll
+++ llvm/trunk/test/CodeGen/ARM/vcombine.ll
@@ -99,7 +99,9 @@
 define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind {
 ; CHECK: vget_high8
 ; CHECK-NOT: vst
-; CHECK-LE: vmov r0, r1, d17
+; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0]
+; CHECK-LE: vldr d16, [r0, #8]
+; CHECK-LE: vmov r0, r1, d16
 ; CHECK-BE: vmov r1, r0, d16
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32>
Index: llvm/trunk/test/CodeGen/ARM/vext.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vext.ll
+++ llvm/trunk/test/CodeGen/ARM/vext.ll
@@ -199,10 +199,10 @@
 define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ; CHECK-LABEL: test_undef:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT: vzip.16 d19, d16
-; CHECK-NEXT: vmov r0, r1, d19
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0, #8]
+; CHECK-NEXT: vzip.16 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
 ; CHECK-NEXT: mov pc, lr
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2216,9 +2216,9 @@
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckwd %k1, %k0, %k0
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
 ; AVX512F-32-NEXT: kmovd %k0, %eax
 ; AVX512F-32-NEXT: retl
 %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -282,8 +282,7 @@
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovups (%rsi), %zmm0
-; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0
+; ALL-NEXT: vmovups 32(%rsi), %ymm0
 ; ALL-NEXT: retq
 %ptr_a = bitcast float* %a to <16 x float>*
 %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -511,11 +511,10 @@
 ;
 ; KNL64-LABEL: expand14:
 ; KNL64: # BB#0:
+; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
+; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
-; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL64-NEXT: retq
 ;
@@ -529,11 +528,10 @@
 ;
 ; KNL32-LABEL: expand14:
 ; KNL32: # BB#0:
+; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
+; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
-; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL32-NEXT: retl
 %addV = fadd <4 x float> ,
@@ -545,39 +543,35 @@
 define <8 x float> @expand15(<4 x float> %a) {
 ; SKX64-LABEL: expand15:
 ; SKX64: # BB#0:
-; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
-; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
-; SKX64-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX64-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
 ; SKX64-NEXT: retq
 ;
 ; KNL64-LABEL: expand15:
 ; KNL64: # BB#0:
+; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
-; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL64-NEXT: retq
 ;
 ; SKX32-LABEL: expand15:
 ; SKX32: # BB#0:
-; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
-; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
-; SKX32-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX32-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
 ; SKX32-NEXT: retl
 ;
 ; KNL32-LABEL: expand15:
 ; KNL32: # BB#0:
+; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
-; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL32-NEXT: retl
 %addV = fadd <4 x float> ,
Index: llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
+++ llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
@@ -151,8 +151,7 @@
 ;
 ; AVX1-LABEL: load_splat_8i32_8i32_01010101:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovapd (%rdi), %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -288,8 +287,7 @@
 ;
 ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -315,22 +313,10 @@
 ; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %ymm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_16i16_16i16_0123012301230123:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
 entry:
 %ld = load <16 x i16>, <16 x i16>* %ptr
 %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32>
@@ -513,8 +499,7 @@
 ;
 ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -587,26 +572,10 @@
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load_splat_4f32_8f32_0000:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_4f32_8f32_0000:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_4f32_8f32_0000:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %ymm0
-; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_4f32_8f32_0000:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX-NEXT: retq
 entry:
 %ld = load <8 x float>, <8 x float>* %ptr
 %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer
@@ -627,22 +596,10 @@
 ; SSE42-NEXT: movapd %xmm0, %xmm1
 ; SSE42-NEXT: retq
 ;
-; AVX1-LABEL: load_splat_8f32_16f32_89898989:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vbroadcastsd 32(%rdi), %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_8f32_16f32_89898989:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vbroadcastsd 32(%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_8f32_16f32_89898989:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovapd (%rdi), %zmm0
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_8f32_16f32_89898989:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vbroadcastsd 32(%rdi), %ymm0
+; AVX-NEXT: retq
 entry:
 %ld = load <16 x float>, <16 x float>* %ptr
 %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32>
Index: llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
+++ llvm/trunk/test/CodeGen/X86/x86-interleaved-access.ll
@@ -57,10 +57,8 @@
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vmovups (%rdi), %ymm0
 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; AVX1-NEXT: vmovups 96(%rdi), %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -69,10 +67,8 @@
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vmovupd (%rdi), %ymm0
 ; AVX2-NEXT: vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT: vmovupd 64(%rdi), %ymm2
-; AVX2-NEXT: vmovupd 96(%rdi), %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT: retq
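
Note: the IR below is an illustrative sketch of the pattern the new combine targets; it is not part of the patch, and the function and value names are made up. When only a subvector of a wide vector load is used, a little-endian target can now select a single narrow load at a byte offset, as the updated ARM vcombine.ll and X86 vector-shuffle-512-v16.ll checks demonstrate.

; Hypothetical example: only the high 8 bytes of a 16-byte load are used.
define <8 x i8> @get_high_half(<16 x i8>* %p) {
  %wide = load <16 x i8>, <16 x i8>* %p
  %high = shufflevector <16 x i8> %wide, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %high
}
; With narrowExtractedVectorLoad, the backend can load 8 bytes from %p + 8
; (e.g. "vldr d16, [r0, #8]" on ARM) instead of loading all 16 bytes and
; extracting the high half, provided the wide load has no other uses and is
; not volatile.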