Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -794,6 +794,9 @@
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -971,11 +974,9 @@
     // We want to legalize this to an f64 load rather than an i64 load on
     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
    // store.
-    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
     setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
     setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
-    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
     setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     setOperationAction(ISD::STORE, MVT::v4i16, Custom);
     setOperationAction(ISD::STORE, MVT::v8i8, Custom);
@@ -21268,21 +21269,29 @@
                          TargetLowering::TypeWidenVector)
     return SDValue();
 
-  // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
-  // and store it.
   MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
                                 StoreVT.getVectorNumElements() * 2);
   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
                           DAG.getUNDEF(StoreVT));
-  MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
-  MVT CastVT = MVT::getVectorVT(StVT, 2);
-  StoredVal = DAG.getBitcast(CastVT, StoredVal);
-  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
-                          DAG.getIntPtrConstant(0, dl));
-  return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
-                      St->getPointerInfo(), St->getAlignment(),
-                      St->getMemOperand()->getFlags());
+  if (Subtarget.hasSSE2()) {
+    // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+    // and store it.
+    MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+    MVT CastVT = MVT::getVectorVT(StVT, 2);
+    StoredVal = DAG.getBitcast(CastVT, StoredVal);
+    StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+                            DAG.getIntPtrConstant(0, dl));
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+  assert(Subtarget.hasSSE1() && "Expected SSE");
+  SDVTList Tys = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+  return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+                                 St->getMemOperand());
 }
 
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -28156,19 +28165,28 @@
     if (!ISD::isNON_EXTLoad(N))
       return;
     auto *Ld = cast<LoadSDNode>(N);
-    MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
-    SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
-                              Ld->getPointerInfo(),
-                              Ld->getAlignment(),
-                              Ld->getMemOperand()->getFlags());
-    SDValue Chain = Res.getValue(1);
-    MVT WideVT = MVT::getVectorVT(LdVT, 2);
-    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
-    MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
-                                  VT.getVectorNumElements() * 2);
-    Res = DAG.getBitcast(CastVT, Res);
+    if (Subtarget.hasSSE2()) {
+      MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+      SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                                Ld->getPointerInfo(), Ld->getAlignment(),
+                                Ld->getMemOperand()->getFlags());
+      SDValue Chain = Res.getValue(1);
+      MVT WideVT = MVT::getVectorVT(LdVT, 2);
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+      MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                    VT.getVectorNumElements() * 2);
+      Res = DAG.getBitcast(CastVT, Res);
+      Results.push_back(Res);
+      Results.push_back(Chain);
+      return;
+    }
+    assert(Subtarget.hasSSE1() && "Expected SSE");
+    SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+    SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+    SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+                                          MVT::i64, Ld->getMemOperand());
     Results.push_back(Res);
-    Results.push_back(Chain);
+    Results.push_back(Res.getValue(1));
     return;
   }
 }
@@ -32017,8 +32035,11 @@
   // directly if we don't shuffle the lower element and we shuffle the upper
   // (zero) elements within themselves.
   if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
-      (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
-    unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+      (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
+       MaskEltSizeInBits) == 0) {
+    unsigned Scale =
+        cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
+        MaskEltSizeInBits;
     ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
     if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
         isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
Index: llvm/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/lib/Target/X86/X86InstrAVX512.td
+++ llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1352,15 +1352,15 @@
 let Predicates = [HasAVX512] in {
   // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
-  def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+  def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
             (VPBROADCASTQZm addr:$src)>;
 }
 
 let Predicates = [HasVLX] in {
   // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
-  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
            (VPBROADCASTQZ128m addr:$src)>;
-  def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+  def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
            (VPBROADCASTQZ256m addr:$src)>;
 }
 let Predicates = [HasVLX, HasBWI] in {
@@ -3838,7 +3838,7 @@
                 (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
 
 let Predicates = [HasAVX512] in {
-  def : Pat<(X86vextractstore (v2i64 VR128X:$src), addr:$dst),
+  def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst),
            (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
 }
 
@@ -3873,7 +3873,7 @@
 // AVX-512 MOVSS, MOVSD
 //===----------------------------------------------------------------------===//
 
-multiclass avx512_move_scalar<string asm, SDNode OpNode,
+multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
                               X86VectorVTInfo _> {
   let Predicates = [HasAVX512, OptForSize] in
   def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
@@ -3901,7 +3901,7 @@
   let canFoldAsLoad = 1, isReMaterializable = 1 in {
   def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-             [(set _.RC:$dst, (_.VT (X86vzload addr:$src)))],
+             [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
              _.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
   // _alt version uses FR32/FR64 register class.
   let isCodeGenOnly = 1 in
@@ -3935,10 +3935,10 @@
                   NotMemoryFoldable;
 }
 
-defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
                VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
 
-defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
                VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 
@@ -4319,16 +4319,16 @@
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8f32 (X86vzload addr:$src)),
+  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzload addr:$src)),
+  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 
   // Represent the same patterns above but in the form they appear for
   // 512-bit types
-  def : Pat<(v16f32 (X86vzload addr:$src)),
+  def : Pat<(v16f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
-  def : Pat<(v8f64 (X86vzload addr:$src)),
+  def : Pat<(v8f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 }
 
@@ -4351,21 +4351,21 @@
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                       (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzload addr:$src)),
+  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzload addr:$src)),
+  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
            (VMOVZPQILo2PQIZrr VR128X:$src)>;
-  def : Pat<(v2i64 (X86vzload addr:$src)),
+  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIZrm addr:$src)>;
-  def : Pat<(v4i64 (X86vzload addr:$src)),
+  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
 
   // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
-  def : Pat<(v16i32 (X86vzload addr:$src)),
+  def : Pat<(v16i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
-  def : Pat<(v8i64 (X86vzload addr:$src)),
+  def : Pat<(v8i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
 
   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
@@ -6353,11 +6353,11 @@
   def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
                     (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
-  def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload addr:$src2))),
+  def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
 
   // VMOVLPD patterns
-  def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload addr:$src2))),
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
 }
 
@@ -8292,47 +8292,47 @@
 }
 
 let Predicates = [HasDQI, HasVLX] in {
-  def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+  def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
            (VCVTPS2QQZ128rm addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             VR128X:$src0)),
            (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             v2i64x_info.ImmAllZerosV)),
            (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
 
-  def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+  def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
            (VCVTPS2UQQZ128rm addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             VR128X:$src0)),
           (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             v2i64x_info.ImmAllZerosV)),
           (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
 
-  def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+  def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
           (VCVTTPS2QQZ128rm addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             VR128X:$src0)),
           (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             v2i64x_info.ImmAllZerosV)),
          (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
 
-  def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+  def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
          (VCVTTPS2UQQZ128rm addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             VR128X:$src0)),
          (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
   def : Pat<(v2i64 (vselect VK2WM:$mask,
-                            (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                             v2i64x_info.ImmAllZerosV)),
          (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
@@ -8375,25 +8375,25 @@
 }
 
 let Predicates = [HasVLX] in {
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDZ128rm addr:$src)>;
   def : Pat<(v2f64 (vselect VK2WM:$mask,
-                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
                             VR128X:$src0)),
            (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
   def : Pat<(v2f64 (vselect VK2WM:$mask,
-                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
                             v2f64x_info.ImmAllZerosV)),
            (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
 
-  def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+  def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
           (VCVTUDQ2PDZ128rm addr:$src)>;
   def : Pat<(v2f64 (vselect VK2WM:$mask,
-                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
                             VR128X:$src0)),
           (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
   def : Pat<(v2f64 (vselect VK2WM:$mask,
-                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
                             v2f64x_info.ImmAllZerosV)),
           (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
@@ -8562,7 +8562,7 @@
                             EVEX_CD8<32, CD8VH>;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
-  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSZ128rm addr:$src)>;
   def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
               (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (VCVTPH2PSZ128rm addr:$src)>;
@@ -9626,13 +9626,13 @@
            (!cast<Instruction>(OpcPrefix#BWZ128rm) addr:$src)>;
   def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<Instruction>(OpcPrefix#BWZ128rm) addr:$src)>;
 }
 let Predicates = [HasVLX] in {
   def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<Instruction>(OpcPrefix#BDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -9642,35 +9642,35 @@
            (!cast<Instruction>(OpcPrefix#WDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
           (!cast<Instruction>(OpcPrefix#WDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
          (!cast<Instruction>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v4i32 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
          (!cast<Instruction>(OpcPrefix#WQZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
          (!cast<Instruction>(OpcPrefix#DQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
         (!cast<Instruction>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
         (!cast<Instruction>(OpcPrefix#DQZ128rm) addr:$src)>;
 }
 
 let Predicates = [HasVLX] in {
   def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
        (!cast<Instruction>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
       (!cast<Instruction>(OpcPrefix#BDZ256rm) addr:$src)>;
 
   def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
      (!cast<Instruction>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
      (!cast<Instruction>(OpcPrefix#BQZ256rm) addr:$src)>;
 
   def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
     (!cast<Instruction>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
     (!cast<Instruction>(OpcPrefix#WQZ256rm) addr:$src)>;
 }
 
 // 512-bit patterns
@@ -10873,7 +10873,7 @@
           (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
 def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
           (VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
          (VMOVDDUPZ128rm addr:$src)>;
 
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
Index: llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -99,10 +99,10 @@
 def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
                        SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
 
-def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
-                       [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def X86vextractstore : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
-                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
+                           [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                      SDTCisInt<0>, SDTCisInt<1>,
@@ -939,10 +939,20 @@
 def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
 def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
 
-def vzload_v4i32 : PatFrag<(ops node:$src),
-                           (bitconvert (v4i32 (X86vzload node:$src)))>;
-def vzload_v2i64 : PatFrag<(ops node:$src),
-                           (bitconvert (v2i64 (X86vzload node:$src)))>;
+def X86vzload32 : PatFrag<(ops node:$src),
+                          (X86vzld node:$src), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
+
+def X86vzload64 : PatFrag<(ops node:$src),
+                          (X86vzld node:$src), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
+                                 (X86vextractst node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
 
 def fp32imm0 : PatLeaf<(f32 fpimm), [{
Index: llvm/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/lib/Target/X86/X86InstrSSE.td
+++ llvm/lib/Target/X86/X86InstrSSE.td
@@ -226,14 +226,15 @@
 
 // Loading from memory automatically zeroing upper bits.
 multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
-                         PatFrag mem_pat, string OpcodeStr, Domain d> {
+                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
+                         Domain d> {
   def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                     [(set VR128:$dst, (vt (X86vzload addr:$src)))], d>,
+                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                      VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
   def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set VR128:$dst, (vt (X86vzload addr:$src)))], d>,
+                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                    Sched<[WriteFLoad]>;
 
   // _alt version uses FR32/FR64 register class.
@@ -255,9 +256,9 @@
                           SSEPackedDouble, "MOVSD", UseSSE2>, XD;
 
   let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, "movss",
+  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                              SSEPackedSingle>, XS;
-  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, "movsd",
+  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                              SSEPackedDouble>, XD;
   }
 
@@ -270,9 +271,9 @@
 
   // Represent the same patterns above but in the form they appear for
   // 256-bit types
-  def : Pat<(v8f32 (X86vzload addr:$src)),
+  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzload addr:$src)),
+  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
 }
 
@@ -663,6 +664,13 @@
   def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
                       (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(v4f32 (X86vzload64 addr:$src)),
+            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
+  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
+            (MOVLPSmr addr:$dst, VR128:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -702,7 +710,7 @@
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                     (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload addr:$src2))),
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
 
   def : Pat<(store (f64 (extractelt
                     (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                     (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;
 
   // MOVLPD patterns
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload addr:$src2))),
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
 }
 
@@ -721,6 +729,12 @@
 
   // No need for aligned load, we're only loading 64-bits.
   def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
+                                addr:$dst),
+            (MOVHPSmr addr:$dst, VR128:$src)>;
 }
 
 let Predicates = [UseSSE2] in {
@@ -731,7 +745,7 @@
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                     (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload addr:$src2))),
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
           (MOVHPDrm VR128:$src1, addr:$src2)>;
 
   def : Pat<(store (f64 (extractelt
                     (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                     (iPTR 0))), addr:$dst),
           (MOVHPDmr addr:$dst, VR128:$src)>;
 
   // MOVLPD patterns
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload addr:$src2))),
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
          (MOVLPDrm VR128:$src1, addr:$src2)>;
 }
 
@@ -1631,13 +1645,13 @@
 
 // AVX register conversion intrinsics
 let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
 } // Predicates = [HasAVX, NoVLX]
 
 // SSE2 register conversion intrinsics
 let Predicates = [UseSSE2] in {
-  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
           (CVTDQ2PDrm addr:$src)>;
 } // Predicates = [UseSSE2]
 
@@ -4124,9 +4138,9 @@
   // These instructions also write zeros in the high part of a 256-bit register.
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzload addr:$src)),
+  def : Pat<(v4i32 (X86vzload32 addr:$src)),
           (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v8i32 (X86vzload addr:$src)),
+  def : Pat<(v8i32 (X86vzload32 addr:$src)),
           (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
 }
 
@@ -4138,7 +4152,7 @@
            (MOV64toPQIrr GR64:$src)>;
   def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
          (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzload addr:$src)),
+  def : Pat<(v4i32 (X86vzload32 addr:$src)),
          (MOVDI2PDIrm addr:$src)>;
 }
 
@@ -4206,19 +4220,19 @@
                 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
 
 let Predicates = [UseAVX] in {
-  def : Pat<(v2i64 (X86vzload addr:$src)),
+  def : Pat<(v2i64 (X86vzload64 addr:$src)),
           (VMOVQI2PQIrm addr:$src)>;
-  def : Pat<(v4i64 (X86vzload addr:$src)),
+  def : Pat<(v4i64 (X86vzload64 addr:$src)),
          (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
 
-  def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
+  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
 }
 
 let Predicates = [UseSSE2] in {
-  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
 
-  def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
+  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
          (MOVPQI2QImr addr:$dst, VR128:$src)>;
 }
 
@@ -4368,7 +4382,7 @@
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-  def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
           (VMOVDDUPrm addr:$src)>,
           Requires<[HasAVX]>;
 }
 
@@ -4376,7 +4390,7 @@
   // No need for aligned memory as this only loads 64-bits.
   def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
          (MOVDDUPrm addr:$src)>;
-  def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
         (MOVDDUPrm addr:$src)>;
 }
 
@@ -4953,7 +4967,7 @@
 
   def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
        (!cast<Instruction>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
       (!cast<Instruction>(OpcPrefix#BDYrm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
@@ -4961,12 +4975,12 @@
      (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;
   def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
     (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
    (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;
 
   def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
   (!cast<Instruction>(OpcPrefix#WQYrm) addr:$src)>;
 }
 }
 
@@ -5018,7 +5032,7 @@
   (!cast<Instruction>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
   (!cast<Instruction>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
   (!cast<Instruction>(OpcPrefix#BWrm) addr:$src)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v4i32 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
   (!cast<Instruction>(OpcPrefix#BDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
   (!cast<Instruction>(OpcPrefix#BDrm) addr:$src)>;
@@ -5040,14 +5054,14 @@
   (!cast<Instruction>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
   (!cast<Instruction>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
   (!cast<Instruction>(OpcPrefix#WDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v4i32 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
   (!cast<Instruction>(OpcPrefix#WQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
   (!cast<Instruction>(OpcPrefix#WQrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
   (!cast<Instruction>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
   (!cast<Instruction>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
   (!cast<Instruction>(OpcPrefix#DQrm) addr:$src)>;
 
@@ -7261,10 +7275,10 @@
                                  WriteCvtPS2PHYSt>, VEX_L;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
-  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
-  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
-              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;
 
   def : Pat<(store (f64 (extractelt
@@ -7436,9 +7450,9 @@
 
 let Predicates = [HasAVX2, NoVLX] in {
   // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
-  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
           (VPBROADCASTQrm addr:$src)>;
-  def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+  def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
          (VPBROADCASTQYrm addr:$src)>;
 
   def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
@@ -7550,7 +7564,7 @@
          (VMOVDDUPrr VR128:$src)>;
   def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
         (VMOVDDUPrm addr:$src)>;
-  def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+  def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
        (VMOVDDUPrm addr:$src)>;
 }
 
Index: llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1319,14 +1319,8 @@
 ; X86-SSE-LABEL: test_mm_loadh_pi:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
-; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
-; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0,0],xmm1[0,0]
-; X86-SSE-NEXT:    shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],xmm2[2,0]
+; X86-SSE-NEXT:    movhps (%eax), %xmm0 # encoding: [0x0f,0x16,0x00]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_loadh_pi:
@@ -1345,18 +1339,8 @@
 ;
 ; X64-SSE-LABEL: test_mm_loadh_pi:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
-; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
-; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
-; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
-; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
-; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
-; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
-; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT:    movhps (%rdi), %xmm0 # encoding: [0x0f,0x16,0x07]
+; X64-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: test_mm_loadh_pi:
@@ -1381,15 +1365,8 @@
 ; X86-SSE-LABEL: test_mm_loadl_pi:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10]
-; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04]
-; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0,0],xmm2[0,0]
-; X86-SSE-NEXT:    shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2]
-; X86-SSE-NEXT:    # xmm1 = xmm1[2,0],xmm0[2,3]
-; X86-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; X86-SSE-NEXT:    movlps (%eax), %xmm0 # encoding: [0x0f,0x12,0x00]
+; X86-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_loadl_pi:
@@ -1408,19 +1385,8 @@
 ;
 ; X64-SSE-LABEL: test_mm_loadl_pi:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
-; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
-; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
-; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
-; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
-; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
-; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
-; X64-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
-; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; X64-SSE-NEXT:    movlps (%rdi), %xmm0 # encoding: [0x0f,0x12,0x07]
+; X64-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: test_mm_loadl_pi:
@@ -2818,13 +2784,7 @@
 ; X86-SSE-LABEL: test_mm_storeh_pi2:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movaps %xmm0, %xmm1 # encoding: [0x0f,0x28,0xc8]
-; X86-SSE-NEXT:    movhlps %xmm0, %xmm1 # encoding: [0x0f,0x12,0xc8]
-; X86-SSE-NEXT:    # xmm1 = xmm0[1],xmm1[1]
-; X86-SSE-NEXT:    shufps $231, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0xe7]
-; X86-SSE-NEXT:    # xmm0 = xmm0[3,1,2,3]
-; X86-SSE-NEXT:    movss %xmm0, 4(%eax) # encoding: [0xf3,0x0f,0x11,0x40,0x04]
-; X86-SSE-NEXT:    movss %xmm1, (%eax) # encoding: [0xf3,0x0f,0x11,0x08]
+; X86-SSE-NEXT:    movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_storeh_pi2:
@@ -2841,11 +2801,7 @@
 ;
 ; X64-SSE-LABEL: test_mm_storeh_pi2:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movhlps %xmm0, %xmm0 # encoding: [0x0f,0x12,0xc0]
-; X64-SSE-NEXT:    # xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
-; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
-; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
+; X64-SSE-NEXT:    movhps %xmm0, (%rdi) # encoding: [0x0f,0x17,0x07]
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: test_mm_storeh_pi2:
@@ -2922,10 +2878,7 @@
 ; X86-SSE-LABEL: test_mm_storel_pi2:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
-; X86-SSE-NEXT:    shufps $229, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0xe5]
-; X86-SSE-NEXT:    # xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT:    movss %xmm0, 4(%eax) # encoding: [0xf3,0x0f,0x11,0x40,0x04]
+; X86-SSE-NEXT:    movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_storel_pi2:
@@ -2942,9 +2895,7 @@
 ;
 ; X64-SSE-LABEL: test_mm_storel_pi2:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
-; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
-; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
+; X64-SSE-NEXT:    movlps %xmm0, (%rdi) # encoding: [0x0f,0x13,0x07]
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: test_mm_storel_pi2:
Index: llvm/test/CodeGen/X86/vector-shuffle-sse1.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ llvm/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -230,15 +230,7 @@
 define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-LABEL: insert_mem_lo_v4f32:
 ; SSE1:       # %bb.0:
-; SSE1-NEXT:    movq (%rdi), %rax
-; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT:    shrq $32, %rax
-; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; SSE1-NEXT:    retq
   %a = load <2 x float>, <2 x float>* %ptr
   %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -249,14 +241,7 @@
 define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
 ; SSE1-LABEL: insert_mem_hi_v4f32:
 ; SSE1:       # %bb.0:
-; SSE1-NEXT:    movq (%rdi), %rax
-; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT:    shrq $32, %rax
-; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE1-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE1-NEXT:    retq
   %a = load <2 x float>, <2 x float>* %ptr
   %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
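---

A note on the mechanism, for readers of the TableGen changes above: the patch collapses the old type-specific `vzload_v4i32`/`vzload_v2i64` fragments into `X86vzload32`/`X86vzload64`, which discriminate on the number of bytes actually read from memory rather than on the node's result type (which, under SSE1, is always `v4f32`). What follows is a minimal C++ sketch of the check the new PatFrag predicates perform, not code from the patch; the helper name `isVZLoadOfWidth` is invented here for illustration, and the snippet assumes it would live inside the X86 backend where `X86ISD` is visible.

```cpp
// Illustrative only -- mirrors the [{ ... }] predicate bodies added to
// X86InstrFragmentsSIMD.td.
#include "X86ISelLowering.h"                // X86ISD::VZEXT_LOAD
#include "llvm/CodeGen/SelectionDAGNodes.h" // MemIntrinsicSDNode, SDValue

using namespace llvm;

// True if V is an X86ISD::VZEXT_LOAD that reads exactly NumBytes from memory.
// The node's result VT no longer implies the load width, so the memory VT
// recorded on the underlying MachineMemOperand is the reliable source.
static bool isVZLoadOfWidth(SDValue V, unsigned NumBytes) {
  if (V.getOpcode() != X86ISD::VZEXT_LOAD)
    return false;
  // VZEXT_LOAD is built with getMemIntrinsicNode, so it carries a memory
  // operand that the PatFrag predicates (and the updated shuffle-combine
  // code in X86ISelLowering.cpp) query via cast<MemIntrinsicSDNode>.
  return cast<MemIntrinsicSDNode>(V)->getMemoryVT().getStoreSize() == NumBytes;
}
```

Under this sketch, `X86vzload32` corresponds to `isVZLoadOfWidth(V, 4)` and `X86vzload64` to `isVZLoadOfWidth(V, 8)` -- the same distinction the revised combineX86ShuffleChain check makes explicitly by querying the memory VT instead of `getScalarValueSizeInBits()` on the result.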