Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2812,21 +2812,17 @@
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                  Tmp3, Tmp4)) {
-    SDValue Load = N1.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0) };
+                      N1.getOperand(0) };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     return CNode;
   }

@@ -2849,22 +2845,18 @@
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                  Tmp3, Tmp4)) {
-    SDValue Load = N2.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0), InFlag };
+                      N2.getOperand(0), InFlag };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     InFlag = SDValue(CNode, 3);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
     return CNode;
   }

Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -869,11 +869,6 @@
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }

-    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
-    }
-
     // Custom lower v2i64 and v2f64 selects.
     setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
@@ -1178,11 +1173,6 @@
     if (HasInt256)
       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

-    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
-    }
-
     if (HasInt256) {
       // Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); @@ -1419,10 +1409,6 @@ setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } - for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { - setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64); - } - // Need to custom split v32i16/v64i8 bitcasts. if (!Subtarget.hasBWI()) { setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); @@ -5539,7 +5525,15 @@ if (!CNode || CNode->isMachineConstantPoolEntry()) return nullptr; - return dyn_cast(CNode->getConstVal()); + const Constant *C = CNode->getConstVal(); + if (!C) + return nullptr; + + // Make sure the load and Constant are the same size. + if (C->getType()->getPrimitiveSizeInBits() != Op.getValueSizeInBits()) + return nullptr; + + return C; } // Extract raw constant bits from constant pools. Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -66,21 +66,16 @@ !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?)); // Load patterns - // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 - // due to load promotion during legalization - PatFrag LdFrag = !cast("load" # - !if (!eq (TypeVariantName, "i"), - !if (!eq (Size, 128), "v2i64", - !if (!eq (Size, 256), "v4i64", - !if (!eq (Size, 512), "v8i64", - VTName))), VTName)); - - PatFrag AlignedLdFrag = !cast("alignedload" # - !if (!eq (TypeVariantName, "i"), - !if (!eq (Size, 128), "v2i64", - !if (!eq (Size, 256), "v4i64", - !if (!eq (Size, 512), "v8i64", - VTName))), VTName)); + PatFrag LdFrag = !cast("load" # VTName); + + PatFrag i64LdFrag = !cast("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), "v8i64", + VTName))), VTName)); + + PatFrag AlignedLdFrag = !cast("alignedload" # VTName); PatFrag ScalarLdFrag = !cast("load" # EltVT); @@ -518,10 +513,10 @@ "vinsert" # From.EltTypeName # "x" # From.NumElts, "$src3, $src2, $src1", "$src1, $src2, $src3", (vinsert_insert:$src3 (To.VT To.RC:$src1), - (From.VT (bitconvert (From.LdFrag addr:$src2))), + (From.VT (From.LdFrag addr:$src2)), (iPTR imm)), (vinsert_for_mask:$src3 (To.VT To.RC:$src1), - (From.VT (bitconvert (From.LdFrag addr:$src2))), + (From.VT (From.LdFrag addr:$src2)), (iPTR imm))>, AVX512AIi8Base, EVEX_4V, EVEX_CD8, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -547,7 +542,7 @@ def : Pat<(vinsert_insert:$ins (To.VT To.RC:$src1), - (From.VT (bitconvert (From.LdFrag addr:$src2))), + (From.VT (From.LdFrag addr:$src2)), (iPTR imm)), (To.VT (!cast(InstrStr#"rm") To.RC:$src1, addr:$src2, @@ -680,9 +675,7 @@ (vselect Cast.KRCWM:$mask, (bitconvert (vinsert_insert:$ins (To.VT To.RC:$src1), - (From.VT - (bitconvert - (From.LdFrag addr:$src2))), + (From.VT (From.LdFrag addr:$src2)), (iPTR imm))), Cast.ImmAllZerosV)), (!cast(InstrStr#"rmkz") @@ -1374,7 +1367,7 @@ defm rm : AVX512_maskable, + (_Src.VT (_Src.LdFrag addr:$src))))>, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } @@ -1389,7 +1382,7 @@ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", (null_frag), (_Dst.VT (X86SubVBroadcast - (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + (_Src.VT (_Src.LdFrag addr:$src))))>, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } @@ -1442,11 +1435,11 @@ let Predicates = [HasAVX512] in { def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), (VBROADCASTF64X4rm addr:$src)>; -def : Pat<(v16i32 (X86SubVBroadcast 
(bc_v8i32 (loadv4i64 addr:$src)))), +def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))), (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))), +def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))), (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))), +def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))), (VBROADCASTI64X4rm addr:$src)>; // Provide fallback in case the load node that is used in the patterns above @@ -1474,9 +1467,9 @@ (VBROADCASTF32X4rm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), +def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))), (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), +def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTI32X4rm addr:$src)>; // Patterns for selects of bitcasted operations. @@ -1506,11 +1499,11 @@ VR512:$src0), (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))), + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), (bc_v8i64 (v16i32 immAllZerosV))), (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))), + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), VR512:$src0), (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } @@ -1527,9 +1520,9 @@ (VBROADCASTF32X4Z256rm addr:$src)>; def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), +def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), +def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTI32X4Z256rm addr:$src)>; // Patterns for selects of bitcasted operations. 
@@ -1591,11 +1584,11 @@ VR256X:$src0), (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), (bc_v4i64 (v8i32 immAllZerosV))), (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), VR256X:$src0), (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; } @@ -1641,11 +1634,11 @@ VR512:$src0), (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), (bc_v8i64 (v16i32 immAllZerosV))), (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), VR512:$src0), (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } @@ -1741,7 +1734,7 @@ (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, - (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>, + (_.VT (_.LdFrag addr:$src3)))), 1>, EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1859,7 +1852,7 @@ (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, - (bitconvert (_.LdFrag addr:$src3)))), 1>, + (_.LdFrag addr:$src3))), 1>, EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -2149,7 +2142,7 @@ (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + (_.VT (_.LdFrag addr:$src2))))]>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCommutable = IsCommutable in def rrk : AVX512BI, + (_.VT (_.LdFrag addr:$src2)))))]>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2291,7 +2283,7 @@ [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), + (_.VT (_.LdFrag addr:$src2)), cond)))]>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCommutable = 1 in @@ -2316,8 +2308,7 @@ (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (_.VT (bitconvert - (_.LdFrag addr:$src2))), + (_.VT (_.LdFrag addr:$src2)), cond))))]>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -2352,13 +2343,13 @@ NotMemoryFoldable; } - def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)), + def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)), + (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, @@ -2544,7 +2535,7 @@ "vcmp${cc}"#_.Suffix, "$src2, $src1", "$src1, $src2", (X86cmpm (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), + (_.VT (_.LdFrag addr:$src2)), imm:$cc)>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -2732,7 +2723,7 @@ OpcodeStr##_.Suffix##mem# "\t{$src2, $src1, 
$dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode - (_.VT (bitconvert (_.LdFrag addr:$src1))), + (_.VT (_.LdFrag addr:$src1)), (i32 imm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512, EVEX, Sched<[Sched.RM]>, EVEX2VEXOverride; @@ -3372,7 +3363,7 @@ "${dst} {${mask}}, $src1}"), [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, - (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT (ld_frag addr:$src1)), (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.RM]>; } @@ -3381,7 +3372,7 @@ OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# "${dst} {${mask}} {z}, $src}", [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, - (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))], _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>; } def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), @@ -3681,6 +3672,20 @@ } let Predicates = [HasAVX512] in { + // 512-bit load. + def : Pat<(alignedloadv16i32 addr:$src), + (VMOVDQA64Zrm addr:$src)>; + def : Pat<(alignedloadv32i16 addr:$src), + (VMOVDQA64Zrm addr:$src)>; + def : Pat<(alignedloadv64i8 addr:$src), + (VMOVDQA64Zrm addr:$src)>; + def : Pat<(loadv16i32 addr:$src), + (VMOVDQU64Zrm addr:$src)>; + def : Pat<(loadv32i16 addr:$src), + (VMOVDQU64Zrm addr:$src)>; + def : Pat<(loadv64i8 addr:$src), + (VMOVDQU64Zrm addr:$src)>; + // 512-bit store. def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; @@ -3697,6 +3702,20 @@ } let Predicates = [HasVLX] in { + // 128-bit load. + def : Pat<(alignedloadv4i32 addr:$src), + (VMOVDQA64Z128rm addr:$src)>; + def : Pat<(alignedloadv8i16 addr:$src), + (VMOVDQA64Z128rm addr:$src)>; + def : Pat<(alignedloadv16i8 addr:$src), + (VMOVDQA64Z128rm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (VMOVDQU64Z128rm addr:$src)>; + def : Pat<(loadv8i16 addr:$src), + (VMOVDQU64Z128rm addr:$src)>; + def : Pat<(loadv16i8 addr:$src), + (VMOVDQU64Z128rm addr:$src)>; + // 128-bit store. def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; @@ -3711,6 +3730,20 @@ def : Pat<(store (v16i8 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + // 256-bit load. + def : Pat<(alignedloadv8i32 addr:$src), + (VMOVDQA64Z256rm addr:$src)>; + def : Pat<(alignedloadv16i16 addr:$src), + (VMOVDQA64Z256rm addr:$src)>; + def : Pat<(alignedloadv32i8 addr:$src), + (VMOVDQA64Z256rm addr:$src)>; + def : Pat<(loadv8i32 addr:$src), + (VMOVDQU64Z256rm addr:$src)>; + def : Pat<(loadv16i16 addr:$src), + (VMOVDQU64Z256rm addr:$src)>; + def : Pat<(loadv32i8 addr:$src), + (VMOVDQU64Z256rm addr:$src)>; + // 256-bit store. 
def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; @@ -4495,7 +4528,7 @@ (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIZrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzload addr:$src)), (VMOVDI2PDIZrm addr:$src)>; @@ -4591,6 +4624,12 @@ (VMOVNTDQAZrm addr:$src)>; def : Pat<(v8i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v16i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v32i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v64i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; } let Predicates = [HasVLX], AddedComplexity = 400 in { @@ -4607,6 +4646,12 @@ (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(v4i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v8i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v16i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v32i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst), (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; @@ -4621,6 +4666,12 @@ (VMOVNTDQAZ128rm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v4i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v8i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v16i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -4639,8 +4690,7 @@ defm rm : AVX512_maskable, + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4771,7 +4821,7 @@ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), - (bitconvert (_Src.LdFrag addr:$src2))))>, + (_Src.LdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -4876,7 +4926,7 @@ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), - (bitconvert (_Src.LdFrag addr:$src2))))>, + (_Src.LdFrag addr:$src2)))>, EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5068,7 +5118,7 @@ (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)), (bitconvert (_.LdFrag addr:$src2)))), (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1, - (bitconvert (_.LdFrag addr:$src2))))))>, + (_.i64LdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5641,7 +5691,7 @@ "$src2, $src1", "$src1, $src2", (OpNode (bitconvert (_.i64VT (and _.RC:$src1, - (bitconvert (_.LdFrag addr:$src2))))), + (_.i64LdFrag addr:$src2)))), _.ImmAllZerosV)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -5805,7 +5855,7 @@ defm mi : AVX512_maskable, Sched<[sched.Folded]>; } @@ -5835,8 +5885,7 @@ defm rm : AVX512_maskable, + (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>, AVX512BIBase, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5990,7 +6039,7 @@ (ins _.RC:$src1, 
_.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, - (_.VT (bitconvert (_.LdFrag addr:$src2)))))>, + (_.VT (_.LdFrag addr:$src2))))>, AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6090,7 +6139,7 @@ def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)), (!cast(InstrStr#_.ZSuffix#rr) _.RC:$src1, _.RC:$src2)>; - def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), + def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))), (!cast(InstrStr#_.ZSuffix##rm) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, @@ -6098,7 +6147,7 @@ (!cast(InstrStr#_.ZSuffix#rrk) _.RC:$src0, _.KRC:$mask, _.RC:$src1, _.RC:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))), + (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)), _.RC:$src0)), (!cast(InstrStr#_.ZSuffix##rmk) _.RC:$src0, _.KRC:$mask, _.RC:$src1, addr:$src2)>; @@ -6107,7 +6156,7 @@ (!cast(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask, _.RC:$src1, _.RC:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))), + (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)), _.ImmAllZerosV)), (!cast(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask, _.RC:$src1, addr:$src2)>; @@ -6332,7 +6381,7 @@ "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>, T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable, + (_Src.LdFrag addr:$src))))>, EVEX, Sched<[sched.Folded]>; defm rmb : AVX512_maskable, + (ld_frag addr:$src)))>, T8PD, Sched<[sched.Folded]>; } @@ -8341,17 +8389,17 @@ } let Predicates = [HasAVX512] in - defm VCVTPH2PSZ : avx512_cvtph2ps, avx512_cvtph2ps_sae, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { defm VCVTPH2PSZ256 : avx512_cvtph2ps, EVEX, EVEX_V256, + load, WriteCvtPH2PSY>, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; defm VCVTPH2PSZ128 : avx512_cvtph2ps, EVEX, EVEX_V128, + load, WriteCvtPH2PS>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. 
@@ -9295,7 +9343,7 @@ (!cast(OpcPrefix#BWZ128rm) addr:$src)>; def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { @@ -9305,7 +9353,7 @@ (!cast(OpcPrefix#BDZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), @@ -9314,7 +9362,7 @@ (!cast(OpcPrefix#BQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -9325,7 +9373,7 @@ (!cast(OpcPrefix#WDZ128rm) addr:$src)>; def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), @@ -9334,7 +9382,7 @@ (!cast(OpcPrefix#WQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -9345,12 +9393,12 @@ (!cast(OpcPrefix#DQZ128rm) addr:$src)>; def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; } // 256-bit patterns let Predicates = [HasVLX, HasBWI] in { - def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BWZ256rm) addr:$src)>; def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BWZ256rm) addr:$src)>; @@ -9364,7 +9412,7 @@ (!cast(OpcPrefix#BDZ256rm) addr:$src)>; def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), @@ -9373,10 +9421,10 @@ (!cast(OpcPrefix#BQZ256rm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WDZ256rm) addr:$src)>; def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 
addr:$src)))), (!cast(OpcPrefix#WDZ256rm) addr:$src)>; @@ -9389,10 +9437,10 @@ (!cast(OpcPrefix#WQZ256rm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), (!cast(OpcPrefix#DQZ256rm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#DQZ256rm) addr:$src)>; @@ -9401,25 +9449,25 @@ } // 512-bit patterns let Predicates = [HasBWI] in { - def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))), + def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))), (!cast(OpcPrefix#BWZrm) addr:$src)>; } let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDZrm) addr:$src)>; def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BQZrm) addr:$src)>; - def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZrm) addr:$src)>; - def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))), + def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))), (!cast(OpcPrefix#WDZrm) addr:$src)>; - def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQZrm) addr:$src)>; - def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))), + def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))), (!cast(OpcPrefix#DQZrm) addr:$src)>; } } @@ -10324,7 +10372,7 @@ (_.VT (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, - (bitconvert (_.LdFrag addr:$src2)), + (CastInfo.LdFrag addr:$src2), (i8 imm:$src3)))))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride; @@ -10490,7 +10538,7 @@ def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, - (bitconvert (To.LdFrag addr:$src2)), + (From.LdFrag addr:$src2), imm:$src3))), To.RC:$src0)), (!cast(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, @@ -10500,7 +10548,7 @@ def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, - (bitconvert (To.LdFrag addr:$src2)), + (From.LdFrag addr:$src2), imm:$src3))), To.ImmAllZerosV)), (!cast(OpcodeStr#"rmikz") To.KRCWM:$mask, @@ -11644,7 +11692,7 @@ (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>, + (VTI.VT (VTI.LdFrag addr:$src3))))>, AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11747,8 +11795,7 @@ (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (bitconvert - (VTI.LdFrag addr:$src3)))))>, + (VTI.VT (VTI.LdFrag addr:$src3))))>, EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb : AVX512_maskable_3src, + (VTI.VT (VTI.LdFrag addr:$src2)))>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -648,21 +648,28 @@ 
//===----------------------------------------------------------------------===//

 // 128-bit load pattern fragments
-// NOTE: all 128-bit integer vector loads are promoted to v2i64
 def loadv4f32  : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
 def loadv2f64  : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
 def loadv2i64  : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+def loadv4i32  : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv8i16  : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
+def loadv16i8  : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;

 // 256-bit load pattern fragments
-// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32  : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
-def loadv4f64  : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
-def loadv4i64  : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+def loadv8f32  : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64  : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64  : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+def loadv8i32  : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>;
+def loadv16i16 : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
+def loadv32i8  : PatFrag<(ops node:$ptr), (v32i8 (load node:$ptr))>;

 // 512-bit load pattern fragments
 def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
-def loadv8f64  : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
-def loadv8i64  : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+def loadv8f64  : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64  : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
+def loadv64i8  : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;

 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -690,15 +697,27 @@
                                (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
                                (v2i64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr),
+                               (v4i32 (alignedload node:$ptr))>;
+def alignedloadv8i16 : PatFrag<(ops node:$ptr),
+                               (v8i16 (alignedload node:$ptr))>;
+def alignedloadv16i8 : PatFrag<(ops node:$ptr),
+                               (v16i8 (alignedload node:$ptr))>;

 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedload node:$ptr))>;
-def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedload node:$ptr))>;
-def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedload node:$ptr))>;
+def alignedloadv8f32  : PatFrag<(ops node:$ptr),
+                                (v8f32 (alignedload node:$ptr))>;
+def alignedloadv4f64  : PatFrag<(ops node:$ptr),
+                                (v4f64 (alignedload node:$ptr))>;
+def alignedloadv4i64  : PatFrag<(ops node:$ptr),
+                                (v4i64 (alignedload node:$ptr))>;
+def alignedloadv8i32  : PatFrag<(ops node:$ptr),
+                                (v8i32 (alignedload node:$ptr))>;
+def alignedloadv16i16 : PatFrag<(ops node:$ptr),
+                                (v16i16 (alignedload node:$ptr))>;
+def alignedloadv32i8  : PatFrag<(ops node:$ptr),
+                                (v32i8 (alignedload node:$ptr))>;

 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
@@ -707,6 +726,12 @@
                                 (v8f64 (alignedload node:$ptr))>;
 def alignedloadv8i64  : PatFrag<(ops node:$ptr),
                                 (v8i64 (alignedload node:$ptr))>;
+def alignedloadv16i32 :
PatFrag<(ops node:$ptr), + (v16i32 (alignedload node:$ptr))>; +def alignedloadv32i16 : PatFrag<(ops node:$ptr), + (v32i16 (alignedload node:$ptr))>; +def alignedloadv64i8 : PatFrag<(ops node:$ptr), + (v64i8 (alignedload node:$ptr))>; // Like 'load', but uses special alignment checks suitable for use in // memory operands in most SSE instructions, which are required to @@ -725,6 +750,9 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; +def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; +def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>; +def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTypeProfile<2, 3, [SDTCisVec<0>, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -589,8 +589,21 @@ // available and changing the domain is beneficial. def : Pat<(alignedloadv4i64 addr:$src), (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv8i32 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv16i16 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv32i8 addr:$src), + (VMOVAPSYrm addr:$src)>; def : Pat<(loadv4i64 addr:$src), (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv8i32 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv16i16 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv32i8 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst), @@ -615,8 +628,20 @@ let Predicates = [UseSSE1] in { def : Pat<(alignedloadv2i64 addr:$src), (MOVAPSrm addr:$src)>; + def : Pat<(alignedloadv4i32 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(alignedloadv8i16 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(alignedloadv16i8 addr:$src), + (MOVAPSrm addr:$src)>; def : Pat<(loadv2i64 addr:$src), (MOVUPSrm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(loadv8i16 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(loadv16i8 addr:$src), + (MOVUPSrm addr:$src)>; def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), (MOVAPSmr addr:$dst, VR128:$src)>; @@ -841,7 +866,7 @@ let mayLoad = 1 in def rm : I, + (SrcTy (ld_frag addr:$src)))))], d>, Sched<[sched.Folded]>; } } @@ -1104,16 +1129,16 @@ ssmem, sse_load_f32, "cvtss2si", WriteCvtSS2I>, XS, REX_W; -defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64, +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG; -defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64, +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PSY>, PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG; -defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64, +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, PS, Requires<[UseSSE2]>; @@ -1672,7 +1697,7 @@ def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 
"vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>, VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", @@ -1682,7 +1707,7 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, + (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, VEX_WIG; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), @@ -1696,7 +1721,7 @@ def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>, Sched<[WriteCvtI2PDLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", @@ -2151,54 +2176,54 @@ } let Predicates = [HasAVX, NoVLX] in { -defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; -defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; -defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; -defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; -defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; -defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; -defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; -defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; }// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { - defm UNPCKHPS: 
sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; - defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; - defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; - defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, + defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; } // Constraints = "$src1 = $dst" let Predicates = [HasAVX1Only] in { - def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; @@ -2284,8 +2309,7 @@ !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)))))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } // ExeDomain = SSEPackedInt @@ -2296,16 +2320,16 @@ Predicate prd> { let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm; + memop, i128mem, sched.XMM, IsCommutable, 1>; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm, VEX_4V, VEX_L, VEX_WIG; } @@ -3306,6 +3330,19 @@ let Predicates = [HasAVX, NoVLX] in { // Additional patterns for other integer sizes. 
+ def : Pat<(alignedloadv4i32 addr:$src), + (VMOVDQArm addr:$src)>; + def : Pat<(alignedloadv8i16 addr:$src), + (VMOVDQArm addr:$src)>; + def : Pat<(alignedloadv16i8 addr:$src), + (VMOVDQArm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (VMOVDQUrm addr:$src)>; + def : Pat<(loadv8i16 addr:$src), + (VMOVDQUrm addr:$src)>; + def : Pat<(loadv16i8 addr:$src), + (VMOVDQUrm addr:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), (VMOVDQAmr addr:$dst, VR128:$src)>; def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), @@ -3345,7 +3382,7 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))]>, + (memop_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } // ExeDomain = SSEPackedInt @@ -3405,28 +3442,28 @@ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, - loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, + load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, - VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM, + VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, - memopv2i64, i128mem, SchedWriteVecIMul.XMM>; + memop, i128mem, SchedWriteVecIMul.XMM>; let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, - loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>, + load, i128mem, SchedWritePSADBW.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, - loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>, + load, i256mem, SchedWritePSADBW.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, - memopv2i64, i128mem, SchedWritePSADBW.XMM>; + memop, i128mem, SchedWritePSADBW.XMM>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Logical Instructions @@ -3453,7 +3490,7 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode RC:$src1, - (SrcVT (bitconvert (ld_frag addr:$src2))))))]>, + (SrcVT (ld_frag addr:$src2)))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def ri : PDIi8, VEX_4V, VEX_WIG; + DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rmi, VEX_4V, VEX_L, + DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rmi; + memop>; } multiclass PDI_binop_ri opc, Format ImmForm, string OpcodeStr, @@ -3582,7 +3619,7 @@ !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), + (vt128 (OpNode (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; } @@ -3600,7 +3637,7 @@ !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), + (vt256 (OpNode (load addr:$src1), (i8 imm:$src2))))]>, 
VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; } @@ -3618,7 +3655,7 @@ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (vt128 (OpNode (memop addr:$src1), (i8 imm:$src2))))]>, Sched<[sched.XMM.Folded]>; } @@ -3658,7 +3695,7 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OutVT (OpNode (ArgVT RC:$src1), - (bitconvert (ld_frag addr:$src2)))))]>, + (ld_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -3683,53 +3720,53 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (OutVT (OpNode (ArgVT RC:$src1), - (bitconvert (ld_frag addr:$src2)))))]>, + (ld_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; } } // ExeDomain = SSEPackedInt @@ -3754,89 +3791,88 @@ !if(Is2Addr, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, - (bitconvert (ld_frag addr:$src2)))))]>, + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, 
"vpunpcklbw", v16i8, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + i128mem, SchedWriteShuffle.XMM, load, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, - i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + i256mem, SchedWriteShuffle.YMM, load, 0>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, 
X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, - i128mem, SchedWriteShuffle.XMM, memopv2i64>; + i128mem, SchedWriteShuffle.XMM, memop>; } } // ExeDomain = SSEPackedInt @@ -4155,7 +4191,7 @@ (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzload addr:$src)), (VMOVDI2PDIrm addr:$src)>; @@ -4180,7 +4216,7 @@ (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; - def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzload addr:$src)), (MOVDI2PDIrm addr:$src)>; @@ -4335,30 +4371,30 @@ let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPrm addr:$src)>; def : Pat<(v4i32 (X86Movsldup VR128:$src)), (VMOVSLDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPrm addr:$src)>; def : Pat<(v8i32 (X86Movshdup VR256:$src)), (VMOVSHDUPYrr VR256:$src)>; - def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), + def : Pat<(v8i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPYrm addr:$src)>; def : Pat<(v8i32 (X86Movsldup VR256:$src)), (VMOVSLDUPYrr VR256:$src)>; - def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), + def : Pat<(v8i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPYrm addr:$src)>; } let Predicates = [UseSSE3] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (MOVSHDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), (MOVSHDUPrm addr:$src)>; def : Pat<(v4i32 (X86Movsldup VR128:$src)), (MOVSLDUPrr VR128:$src)>; - def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), (MOVSLDUPrm addr:$src)>; } @@ -4580,7 +4616,7 @@ (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>, + (vt (OpNode (ld_frag addr:$src))))]>, Sched<[sched.XMM.Folded]>; } @@ -4597,19 +4633,19 @@ (ins i256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>, + (vt (OpNode (load addr:$src))))]>, 
Sched<[sched.YMM.Folded]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, - loadv2i64>, VEX, VEX_WIG; + load>, VEX, VEX_WIG; defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, - loadv2i64>, VEX, VEX_WIG; + load>, VEX, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, - loadv2i64>, VEX, VEX_WIG; + load>, VEX, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>, @@ -4623,11 +4659,11 @@ } defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, - memopv2i64>; + memop>; defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, - memopv2i64>; + memop>; defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, - memopv2i64>; + memop>; //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions @@ -4652,8 +4688,7 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (DstVT (OpNode (OpVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))]>, + (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4675,8 +4710,7 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, - (bitconvert (ld_frag addr:$src2))))]>, + (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4693,83 +4727,83 @@ (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, + (IntId256 VR256:$src1, (load addr:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, - VR128, loadv2i64, i128mem, + VR128, load, i128mem, SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, - v16i8, VR128, loadv2i64, i128mem, + v16i8, VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; } defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, - VR128, loadv2i64, i128mem, + VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, - loadv2i64, i128mem, + load, i128mem, SchedWritePHAdd.XMM, 0>, VEX_4V; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, - SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPSIGNW : 
SS3I_binop_rm_int<0x09, "vpsignw", int_x86_ssse3_psign_w_128, - SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, - SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; } } let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, - v32i8, VR256, loadv4i64, i256mem, + v32i8, VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, - loadv4i64, i256mem, + load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, - VR256, loadv4i64, i256mem, + VR256, load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, - loadv4i64, i256mem, + load, i256mem, SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; @@ -4790,33 +4824,33 @@ let ImmT = NoImm, Constraints = "$src1 = $dst" in { let isCommutable = 0 in { defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, - memopv2i64, i128mem, SchedWritePHAdd.XMM>; + memop, i128mem, SchedWritePHAdd.XMM>; defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, - SchedWriteVecALU.XMM, memopv2i64>; + SchedWriteVecALU.XMM, memop>; defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128, - SchedWriteVecALU.XMM, memopv2i64>; + SchedWriteVecALU.XMM, memop>; defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128, - SchedWriteVecALU.XMM, memopv2i64>; + SchedWriteVecALU.XMM, memop>; defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, - memopv2i64, i128mem, SchedWriteVarShuffle.XMM>; + memop, i128mem, 
SchedWriteVarShuffle.XMM>; defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, memopv2i64>; + SchedWritePHAdd.XMM, memop>; defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, memopv2i64>; + SchedWritePHAdd.XMM, memop>; defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, - v16i8, VR128, memopv2i64, i128mem, + v16i8, VR128, memop, i128mem, SchedWriteVecIMul.XMM>; } defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, - VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>; + VR128, memop, i128mem, SchedWriteVecIMul.XMM>; } //===---------------------------------------------------------------------===// @@ -4843,20 +4877,20 @@ !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, (VT (X86PAlignr RC:$src1, - (bitconvert (memop_frag addr:$src2)), + (memop_frag addr:$src2), (i8 imm:$src3))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in - defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem, + defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem, SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in - defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem, + defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem, SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in - defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem, + defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem, SchedWriteShuffle.XMM>; //===---------------------------------------------------------------------===// @@ -4980,7 +5014,7 @@ // AVX2 Register-Memory patterns let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BWYrm) addr:$src)>; def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BWYrm) addr:$src)>; @@ -4994,7 +5028,7 @@ (!cast(OpcPrefix#BDYrm) addr:$src)>; def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), @@ -5003,10 +5037,10 @@ (!cast(OpcPrefix#BQYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WDYrm) addr:$src)>; def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#WDYrm) addr:$src)>; @@ -5019,10 +5053,10 @@ (!cast(OpcPrefix#WQYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), 
(!cast(OpcPrefix#DQYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#DQYrm) addr:$src)>; @@ -5082,7 +5116,7 @@ (!cast(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BWrm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BWrm) addr:$src)>; } let Predicates = [HasAVX, NoVLX] in { @@ -5092,7 +5126,7 @@ (!cast(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), @@ -5101,7 +5135,7 @@ (!cast(OpcPrefix#BQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -5112,7 +5146,7 @@ (!cast(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WDrm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WDrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), @@ -5121,7 +5155,7 @@ (!cast(OpcPrefix#WQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -5132,7 +5166,7 @@ (!cast(OpcPrefix#DQrm) addr:$src)>; def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#DQrm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), (!cast(OpcPrefix#DQrm) addr:$src)>; } } @@ -5950,7 +5984,7 @@ (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, + (v8i16 (OpNode (ld_frag addr:$src))))]>, Sched<[Sched.Folded]>; } @@ -5958,10 +5992,10 @@ // model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", - X86phminpos, loadv2i64, + X86phminpos, load, WritePHMINPOS>, VEX, VEX_WIG; defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", - X86phminpos, memopv2i64, + X86phminpos, memop, WritePHMINPOS>; /// SS48I_binop_rm - Simple SSE41 binary operator. 
@@ -5983,118 +6017,118 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasAVX, NoVLX] in { defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, - loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, + load, i128mem, SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, - loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>, + load, i256mem, SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, 
VEX_L, VEX_WIG; defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, - memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>; + memop, i128mem, SchedWriteVecIMul.XMM, 1>; } let Predicates = [HasAVX, NoVLX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>, + load, i128mem, SchedWritePMULLD.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX] in defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX] in defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>, + load, i256mem, SchedWritePMULLD.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in { defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, - memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>; + memop, i128mem, SchedWritePMULLD.XMM, 1>; defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + memop, i128mem, SchedWriteVecALU.XMM, 1>; } /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate @@ -6120,8 +6154,7 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (IntId RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6148,8 +6181,7 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, Sched<[sched.Folded, 
sched.ReadAfterFold]>; } @@ -6171,28 +6203,28 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0, + VR128, load, i128mem, 0, SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; } let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, - VR128, loadv4f32, f128mem, 0, + VR128, load, f128mem, 0, SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, - VR128, loadv2f64, f128mem, 0, + VR128, load, f128mem, 0, SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, - VR256, loadv8f32, i256mem, 0, + VR256, load, i256mem, 0, SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, - VR256, loadv4i64, i256mem, 0, + VR256, load, i256mem, 0, SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; } } @@ -6200,17 +6232,17 @@ let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, 1, + VR128, memop, i128mem, 1, SchedWriteMPSAD.XMM>; } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, - VR128, memopv4f32, f128mem, 1, + VR128, memop, f128mem, 1, SchedWriteDPPS.XMM>; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, - VR128, memopv2f64, f128mem, 1, + VR128, memop, f128mem, 1, SchedWriteDPPD.XMM>; } @@ -6238,56 +6270,54 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Pattern to commute if load is in first source. 
- def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), - RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)), (!cast(NAME#"rmi") RC:$src1, addr:$src2, (commuteXForm imm:$src3))>; } let Predicates = [HasAVX] in { defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, - VR128, loadv4f32, f128mem, 0, SSEPackedSingle, + VR128, load, f128mem, 0, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>, VEX_4V, VEX_WIG; defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, - VR256, loadv8f32, f256mem, 0, SSEPackedSingle, + VR256, load, f256mem, 0, SSEPackedSingle, SchedWriteFBlend.YMM, BlendCommuteImm8>, VEX_4V, VEX_L, VEX_WIG; defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, - VR128, loadv2f64, f128mem, 0, SSEPackedDouble, + VR128, load, f128mem, 0, SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>, VEX_4V, VEX_WIG; defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, - VR256, loadv4f64, f256mem, 0, SSEPackedDouble, + VR256, load, f256mem, 0, SSEPackedDouble, SchedWriteFBlend.YMM, BlendCommuteImm4>, VEX_4V, VEX_L, VEX_WIG; defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, - VR128, loadv2i64, i128mem, 0, SSEPackedInt, + VR128, load, i128mem, 0, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2] in { defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, - VR256, loadv4i64, i256mem, 0, SSEPackedInt, + VR256, load, i256mem, 0, SSEPackedInt, SchedWriteBlend.YMM, BlendCommuteImm8>, VEX_4V, VEX_L, VEX_WIG; } defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, - VR128, memopv4f32, f128mem, 1, SSEPackedSingle, + VR128, memop, f128mem, 1, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>; defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, - VR128, memopv2f64, f128mem, 1, SSEPackedDouble, + VR128, memop, f128mem, 1, SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>; defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, - VR128, memopv2i64, i128mem, 1, SSEPackedInt, + VR128, memop, i128mem, 1, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>; // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -6321,7 +6351,7 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + (IntId RC:$src1, (mem_frag addr:$src2), RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 @@ -6334,7 +6364,7 @@ let Predicates = [HasAVX] in { let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, - loadv2f64, int_x86_sse41_blendvpd, + load, int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, loadv4f64, int_x86_avx_blendv_pd_256, @@ -6342,20 +6372,20 @@ } // ExeDomain = SSEPackedDouble let ExeDomain = SSEPackedSingle in { defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, - loadv4f32, int_x86_sse41_blendvps, + load, int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, loadv8f32, int_x86_avx_blendv_ps_256, SchedWriteFVarBlend.YMM>, VEX_L; } // ExeDomain = SSEPackedSingle defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - loadv2i64, 
int_x86_sse41_pblendvb, + load, int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; } let Predicates = [HasAVX2] in { defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, - loadv4i64, int_x86_avx2_pblendvb, + load, int_x86_avx2_pblendvb, SchedWriteVarBlend.YMM>, VEX_L; } @@ -6486,18 +6516,18 @@ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), [(set VR128:$dst, (IntId VR128:$src1, - (bitconvert (mem_frag addr:$src2)), XMM0))]>, + (mem_frag addr:$src2), XMM0))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } let ExeDomain = SSEPackedDouble in -defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem, int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; let ExeDomain = SSEPackedSingle in -defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem, int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; -defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem, int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; // Aliases with the implicit xmm0 argument @@ -6553,6 +6583,12 @@ (VMOVNTDQAYrm addr:$src)>; def : Pat<(v4i64 (alignednontemporalload addr:$src)), (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v8i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v16i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v32i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; } let Predicates = [HasAVX, NoVLX] in { @@ -6562,6 +6598,12 @@ (VMOVNTDQArm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (VMOVNTDQArm addr:$src)>; + def : Pat<(v4i32 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v8i16 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v16i8 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; } let Predicates = [UseSSE41] in { @@ -6571,6 +6613,12 @@ (MOVNTDQArm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (MOVNTDQArm addr:$src)>; + def : Pat<(v4i32 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v8i16 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v16i8 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; } } // AddedComplexity @@ -6603,17 +6651,17 @@ let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + load, i128mem, SchedWriteVecALU.XMM, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + load, i256mem, SchedWriteVecALU.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, - memopv2i64, i128mem, SchedWriteVecALU.XMM>; + memop, i128mem, SchedWriteVecALU.XMM>; //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions @@ -6764,9 +6812,9 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), + (memop addr:$src2), XMM0)), (set VR128:$dst, (IntId VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8, + (memop 
addr:$src2))))]>, T8, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6783,7 +6831,7 @@ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), + (memop addr:$src2), (i8 imm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -6836,39 +6884,39 @@ // Perform One Round of an AES Encryption/Decryption Flow let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG; + int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG; } let Predicates = [NoVLX, HasVAES] in { defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc_256, loadv4i64, 0, VR256, + int_x86_aesni_aesenc_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256, + int_x86_aesni_aesenclast_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec_256, loadv4i64, 0, VR256, + int_x86_aesni_aesdec_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256, + int_x86_aesni_aesdeclast_256, load, 0, VR256, i256mem>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", - int_x86_aesni_aesenc, memopv2i64, 1>; + int_x86_aesni_aesenc, memop, 1>; defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", - int_x86_aesni_aesenclast, memopv2i64, 1>; + int_x86_aesni_aesenclast, memop, 1>; defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", - int_x86_aesni_aesdec, memopv2i64, 1>; + int_x86_aesni_aesdec, memop, 1>; defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", - int_x86_aesni_aesdeclast, memopv2i64, 1>; + int_x86_aesni_aesdeclast, memop, 1>; } // Perform the AES InvMixColumn Transformation @@ -6882,7 +6930,7 @@ def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>, Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; } def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), @@ -6893,7 +6941,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "aesimc\t{$src1, $dst|$dst, $src1}", - [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>, Sched<[WriteAESIMC.Folded]>; // AES Round Key Generation Assist @@ -6908,7 +6956,7 @@ (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>, VEX, 
VEX_WIG; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), @@ -6921,7 +6969,7 @@ (ins i128mem:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>; //===----------------------------------------------------------------------===// @@ -6949,12 +6997,12 @@ (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), + (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), imm:$src3))]>, Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; } // Constraints = "$src1 = $dst" - def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1, + def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, (i8 imm:$src3)), (PCLMULQDQrm VR128:$src1, addr:$src2, (PCLMULCommuteImm imm:$src3))>; @@ -6997,11 +7045,11 @@ } let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in -defm VPCLMULQDQ : vpclmulqdq, VEX_4V, VEX_WIG; let Predicates = [NoVLX, HasVPCLMULQDQ] in -defm VPCLMULQDQY : vpclmulqdq, VEX_4V, VEX_L, VEX_WIG; multiclass vpclmulqdq_aliases_impl; -def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), +def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), (VBROADCASTI128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), +def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), (VBROADCASTI128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), +def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTI128 addr:$src)>; } @@ -7175,11 +7223,11 @@ let Predicates = [HasAVX1Only] in { def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), +def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), (VBROADCASTF128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), +def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), (VBROADCASTF128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), +def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTF128 addr:$src)>; } @@ -7212,7 +7260,7 @@ (!cast(InstrStr#rr) VR256:$src1, VR128:$src2, (INSERT_get_vinsert128_imm VR256:$ins))>; def : Pat<(vinsert128_insert:$ins (To VR256:$src1), - (From (bitconvert (memop_frag addr:$src2))), + (From (memop_frag addr:$src2)), (iPTR imm)), (!cast(InstrStr#rm) VR256:$src1, addr:$src2, (INSERT_get_vinsert128_imm VR256:$ins))>; @@ -7225,9 +7273,9 @@ let Predicates = [HasAVX1Only] in { defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; - defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>; - defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>; - defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>; + defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>; + defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>; } //===----------------------------------------------------------------------===// @@ -7316,7 +7364,7 @@ multiclass avx_permil opc_rm, bits<8> opc_rmi, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop_f, - X86MemOperand 
x86memop_i, PatFrag i_frag, + X86MemOperand x86memop_i, ValueType f_vt, ValueType i_vt, X86FoldableSchedWrite sched, X86FoldableSchedWrite varsched> { @@ -7330,7 +7378,7 @@ (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, - (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V, + (i_vt (load addr:$src2)))))]>, VEX_4V, Sched<[varsched.Folded, sched.ReadAfterFold]>; def ri : AVXAIi8; defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM, + v8f32, v8i32, SchedWriteFShuffle.YMM, SchedWriteFVarShuffle.YMM>, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, - loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM, + v2f64, v2i64, SchedWriteFShuffle.XMM, SchedWriteFVarShuffle.XMM>; defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, - loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM, + v4f64, v4i64, SchedWriteFShuffle.YMM, SchedWriteFVarShuffle.YMM>, VEX_L; } @@ -7441,8 +7489,7 @@ let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86cvtph2ps (bc_v8i16 - (loadv2i64 addr:$src))))]>, + [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>, T8PD, VEX, Sched<[sched.Folded]>; } @@ -7516,7 +7563,7 @@ /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate multiclass AVX2_blend_rmi opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, X86FoldableSchedWrite sched, - RegisterClass RC, PatFrag memop_frag, + RegisterClass RC, X86MemOperand x86memop, SDNodeXForm commuteXForm> { let isCommutable = 1 in def rri : AVX2AIi8, + (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; // Pattern to commute if load is in first source. 
- def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), - RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)), (!cast(NAME#"rmi") RC:$src1, addr:$src2, (commuteXForm imm:$src3))>; } defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, - SchedWriteBlend.XMM, VR128, loadv2i64, i128mem, + SchedWriteBlend.XMM, VR128, i128mem, BlendCommuteImm4>; defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, - SchedWriteBlend.YMM, VR256, loadv4i64, i256mem, + SchedWriteBlend.YMM, VR256, i256mem, BlendCommuteImm8>, VEX_L; // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -7779,7 +7824,7 @@ // VPERM - Permute instructions // -multiclass avx2_perm opc, string OpcodeStr, PatFrag mem_frag, +multiclass avx2_perm opc, string OpcodeStr, ValueType OpVT, X86FoldableSchedWrite Sched, X86MemOperand memOp> { let Predicates = [HasAVX2, NoVLX] in { @@ -7796,16 +7841,14 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, - (bitconvert (mem_frag addr:$src2)))))]>, + (load addr:$src2))))]>, Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; } } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256, - i256mem>; +defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256, - f256mem>; +defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; multiclass avx2_perm_imm opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT, X86FoldableSchedWrite Sched, @@ -7875,9 +7918,9 @@ let Predicates = [HasAVX2, NoVLX] in { defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; - defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>; - defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>; - defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>; + defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>; + defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>; } //===----------------------------------------------------------------------===// @@ -8036,7 +8079,7 @@ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, - (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + (vt128 (load addr:$src2)))))]>, VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, SchedWriteVarVecShift.XMM.ReadAfterFold]>; def Yrr : AVX28I, + (vt256 (load addr:$src2)))))]>, VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, SchedWriteVarVecShift.YMM.ReadAfterFold]>; } @@ -8064,13 +8107,11 @@ def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), (VPSRAVDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86vsrav VR128:$src1, - (bitconvert (loadv2i64 addr:$src2)))), + def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))), (VPSRAVDrm VR128:$src1, addr:$src2)>; def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)), (VPSRAVDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86vsrav VR256:$src1, - (bitconvert (loadv4i64 addr:$src2)))), + def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))), (VPSRAVDYrm VR256:$src1, addr:$src2)>; } @@ -8152,7 +8193,7 @@ def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "", [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, - (bitconvert (MemOpFrag addr:$src2)))))]>, + (MemOpFrag 
addr:$src2))))]>, Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD; } } @@ -8170,7 +8211,7 @@ def rmi : Ii8, Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } @@ -8180,24 +8221,24 @@ let Constraints = "$src1 = $dst", Predicates = [HasGFNI, UseSSE2] in defm NAME : GF2P8AFFINE_rmi; + VR128, load, i128mem, 1>; let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { defm V##NAME : GF2P8AFFINE_rmi, VEX_4V, VEX_W; + load, i128mem>, VEX_4V, VEX_W; defm V##NAME##Y : GF2P8AFFINE_rmi, VEX_4V, VEX_L, VEX_W; + load, i256mem>, VEX_4V, VEX_L, VEX_W; } } // GF2P8MULB let Constraints = "$src1 = $dst", Predicates = [HasGFNI, UseSSE2] in -defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64, +defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop, i128mem, 1>; let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { - defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64, + defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load, i128mem>, VEX_4V; - defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64, + defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load, i256mem>, VEX_4V, VEX_L; } // GF2P8AFFINEINVQB, GF2P8AFFINEQB Index: lib/Target/X86/X86InstrXOP.td =================================================================== --- lib/Target/X86/X86InstrXOP.td +++ lib/Target/X86/X86InstrXOP.td @@ -11,32 +11,32 @@ // //===----------------------------------------------------------------------===// -multiclass xop2op opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { +multiclass xop2op opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP, XOP, Sched<[SchedWritePHAdd.XMM]>; def rm : IXOP, XOP, + [(set VR128:$dst, (Int (load addr:$src)))]>, XOP, Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>; } let ExeDomain = SSEPackedInt in { - defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; - defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; - defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; - defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; - defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; - defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; - defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; - defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; - defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; - defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; - defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; - defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; - defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; - defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; - defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd>; + defm VPHADDUDQ : 
xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd>; } // Scalar load 2 addr operand instructions @@ -48,47 +48,47 @@ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>; def rm : IXOP, XOP, + [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass xop2op128 opc, string OpcodeStr, Intrinsic Int, - PatFrag memop, X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched> { def rr : IXOP, XOP, Sched<[sched]>; def rm : IXOP, XOP, + [(set VR128:$dst, (Int (load addr:$src)))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass xop2op256 opc, string OpcodeStr, Intrinsic Int, - PatFrag memop, X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched> { def Yrr : IXOP, XOP, VEX_L, Sched<[sched]>; def Yrm : IXOP, XOP, VEX_L, + [(set VR256:$dst, (Int (load addr:$src)))]>, XOP, VEX_L, Sched<[sched.Folded, sched.ReadAfterFold]>; } let ExeDomain = SSEPackedSingle in { defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, ssmem, sse_load_f32, SchedWriteFRnd.Scl>; - defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32, + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, SchedWriteFRnd.XMM>; - defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32, + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, SchedWriteFRnd.YMM>; } let ExeDomain = SSEPackedDouble in { defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, sdmem, sse_load_f64, SchedWriteFRnd.Scl>; - defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64, + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, SchedWriteFRnd.XMM>; - defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64, + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, SchedWriteFRnd.YMM>; } @@ -105,13 +105,13 @@ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), - (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + (vt128 (load addr:$src2)))))]>, XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold]>; def mr : IXOP, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; // For disassembler @@ -150,7 +150,7 @@ (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, + (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -181,7 +181,7 @@ !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + (Int VR128:$src1, (load addr:$src2), VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -260,7 +260,7 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), - (vt128 (bitconvert (loadv2i64 addr:$src2))), + (vt128 (load addr:$src2)), imm:$cc)))]>, XOP_4V, 
Sched<[sched.Folded, sched.ReadAfterFold]>; let isAsmParserOnly = 1, hasSideEffects = 0 in { @@ -279,7 +279,7 @@ } } - def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)), + def : Pat<(OpNode (load addr:$src2), (vt128 VR128:$src1), imm:$cc), (!cast(NAME#"mi") VR128:$src1, addr:$src2, (CommuteVPCOMCC imm:$cc))>; @@ -310,14 +310,14 @@ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>, + (vt128 (load addr:$src3)))))]>, XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def rmr : IXOPi8Reg, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold, // 128mem:$src2 @@ -401,8 +401,7 @@ !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, - (VT (X86vpermil2 RC:$src1, RC:$src2, - (bitconvert (IntLdFrag addr:$src3)), + (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3), (i8 imm:$src4))))]>, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def mr : IXOP5; defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem, - v8f32, loadv8f32, loadv4i64, + v8f32, loadv8f32, loadv8i32, SchedWriteFVarShuffle.YMM>, VEX_L; } Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -224,7 +224,7 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: shuffle_v16i16_4501_mem: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1630,7 +1630,7 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] @@ -1654,19 +1654,19 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-FAST-NEXT: 
vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, 64(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, 64(%rdi) +; AVX2-FAST-NEXT: vmovups %ymm4, 32(%rdi) ; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq Index: test/CodeGen/X86/pshufb-mask-comments.ll =================================================================== --- test/CodeGen/X86/pshufb-mask-comments.ll +++ test/CodeGen/X86/pshufb-mask-comments.ll @@ -57,9 +57,9 @@ ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%rax) -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1] -; CHECK-NEXT: movdqa %xmm1, (%rax) -; CHECK-NEXT: pshufb %xmm1, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,1] +; CHECK-NEXT: movaps %xmm1, (%rax) +; CHECK-NEXT: pshufb (%rax), %xmm0 ; CHECK-NEXT: retq store <2 x i64> , <2 x i64>* undef, align 16 %l = load <2 x i64>, <2 x i64>* undef, align 16 Index: test/CodeGen/X86/vector-extend-inreg.ll =================================================================== --- test/CodeGen/X86/vector-extend-inreg.ll +++ test/CodeGen/X86/vector-extend-inreg.ll @@ -13,6 +13,7 @@ ; X32-SSE-NEXT: subl $384, %esp # imm = 0x180 ; X32-SSE-NEXT: movl 88(%ebp), %ecx ; X32-SSE-NEXT: movdqa 72(%ebp), %xmm0 +; X32-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: xorps %xmm1, %xmm1 ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) @@ -21,7 +22,6 @@ ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X32-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; X32-SSE-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp) ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) Index: test/CodeGen/X86/vector-idiv-v2i32.ll =================================================================== --- test/CodeGen/X86/vector-idiv-v2i32.ll +++ test/CodeGen/X86/vector-idiv-v2i32.ll @@ -693,20 +693,20 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-NEXT: movdqa {{.*#+}} xmm3 = [31,0,31,0] -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: psrlq %xmm3, %xmm4 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [31,0,31,0] +; X86-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-NEXT: movdqa %xmm3, %xmm4 +; X86-NEXT: psrlq %xmm2, %xmm4 ; X86-NEXT: movl $31, %ecx ; X86-NEXT: movd %ecx, %xmm5 -; X86-NEXT: psrlq %xmm5, %xmm2 -; X86-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; X86-NEXT: psrlq %xmm5, %xmm3 +; X86-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; X86-NEXT: 
movdqa %xmm1, %xmm4
-; X86-NEXT: psrlq %xmm3, %xmm4
+; X86-NEXT: psrlq %xmm2, %xmm4
 ; X86-NEXT: psrlq %xmm5, %xmm1
 ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X86-NEXT: xorpd %xmm2, %xmm1
-; X86-NEXT: psubq %xmm2, %xmm1
+; X86-NEXT: xorpd %xmm3, %xmm1
+; X86-NEXT: psubq %xmm3, %xmm1
 ; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
 ; X86-NEXT: psrlq $29, %xmm1
 ; X86-NEXT: paddq %xmm0, %xmm1
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -1923,7 +1923,8 @@
 ;
 ; AVX2-SLOW-LABEL: PR32160:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vzeroupper
 ; AVX2-SLOW-NEXT: retq
Index: test/CodeGen/X86/widened-broadcast.ll
===================================================================
--- test/CodeGen/X86/widened-broadcast.ll
+++ test/CodeGen/X86/widened-broadcast.ll
@@ -121,10 +121,21 @@
 ; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load_splat_8i32_4i32_01010101:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_splat_8i32_4i32_01010101:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_splat_8i32_4i32_01010101:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_splat_8i32_4i32_01010101:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT: retq
 entry:
 %ld = load <4 x i32>, <4 x i32>* %ptr
 %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32>
@@ -138,21 +149,10 @@
 ; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load_splat_8i32_8i32_01010101:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_8i32_8i32_01010101:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_8i32_8i32_01010101:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_8i32_8i32_01010101:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
 entry:
 %ld = load <8 x i32>, <8 x i32>* %ptr
 %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32>
@@ -246,10 +246,21 @@
 ; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT: retq
 entry:
 %ld = load <8 x i16>, <8 x i16>* %ptr
 %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32>
@@ -263,21 +274,10 @@
 ; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: retq
 entry:
 %ld = load <16 x i16>, <16 x i16>* %ptr
 %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32>
@@ -446,10 +446,21 @@
 ; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT: retq
 entry:
 %ld = load <16 x i8>, <16 x i8>* %ptr
 %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32>