Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3690,353 +3690,6 @@ (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>; - - // Special patterns for storing subvector extracts of lower 128-bits of 256. - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2f64 (extract_subvector - (v4f64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4f32 (extract_subvector - (v8f32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v2i64 (extract_subvector - (v4i64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v8i32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v16i16 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v32i8 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - - def : Pat<(store (v2f64 (extract_subvector - (v4f64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v4f32 (extract_subvector - (v8f32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v2i64 (extract_subvector - (v4i64 VR256X:$src), (iPTR 0))), addr:$dst), - 
(VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v8i32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v16i16 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v32i8 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - - // Special patterns for storing subvector extracts of lower 128-bits of 512. - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v2i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - - def : Pat<(store (v2f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v4f32 
(extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v2i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - - // Special patterns for storing subvector extracts of lower 256-bits of 512. - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v4f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v8f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v4i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v8i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v16i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore (v32i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG 
VR512:$src,sub_ymm)))>; - - def : Pat<(store (v4f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v8f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v4i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v8i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v16i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v32i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - - // If we're inserting into an all zeros vector, just use a plain move which - // will zero the upper bits. - // TODO: Is there a safe way to detect whether the producing instruction - // already zeroed the upper bits? - - // 128->256 register form. 
- def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v2f64 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rr VR128:$src), sub_xmm)>; - def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v4f32 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rr VR128:$src), sub_xmm)>; - def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v2i64 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; - def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v4i32 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; - def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v8i16 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; - def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v16i8 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128:$src), sub_xmm)>; - - // 128->256 memory form. 
- def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (loadv2f64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rm addr:$src), sub_xmm)>; - def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (loadv4f32 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rm addr:$src), sub_xmm)>; - def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (loadv2i64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (bc_v4i32 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (bc_v8i16 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (bc_v16i8 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - - // 128->512 register form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v2f64 VR128X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rr VR128X:$src), sub_xmm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4f32 VR128X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rr VR128X:$src), sub_xmm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v2i64 VR128X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4i32 VR128X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v8i16 VR128X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v16i8 VR128X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rr VR128X:$src), sub_xmm)>; - - // 128->512 memory form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv2f64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDZ128rm addr:$src), sub_xmm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv4f32 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSZ128rm addr:$src), sub_xmm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv2i64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v4i32 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v8i16 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v16i8 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z128rm addr:$src), sub_xmm)>; - - // 256->512 register form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4f64 VR256X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDZ256rr VR256X:$src), sub_ymm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v8f32 VR256X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSZ256rr VR256X:$src), sub_ymm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4i64 VR256X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v8i32 VR256X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v16i16 VR256X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v32i8 VR256X:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rr VR256X:$src), sub_ymm)>; - - // 256->512 memory form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv4f64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDZ256rm addr:$src), sub_ymm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv8f32 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSZ256rm addr:$src), sub_ymm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv4i64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v8i32 (loadv4i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v16i16 (loadv4i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v32i8 (loadv4i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQA64Z256rm addr:$src), sub_ymm)>; -} - -let Predicates = [HasAVX512, NoVLX] in { - // If we're inserting into an all zeros vector, just use a plain move which - // will zero the upper bits. - // TODO: Is there a safe way to detect whether the producing instruction - // already zeroed the upper bits? - - // 128->512 register form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v2f64 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDrr VR128:$src), sub_xmm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4f32 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSrr VR128:$src), sub_xmm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v2i64 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4i32 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v8i16 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v16i8 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - - // 128->512 memory form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv2f64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDrm addr:$src), sub_xmm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv4f32 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSrm addr:$src), sub_xmm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv2i64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v4i32 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v8i16 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v16i8 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - - // 256->512 register form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4f64 VR256:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDYrr VR256:$src), sub_ymm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v8f32 VR256:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSYrr VR256:$src), sub_ymm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v4i64 VR256:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v8i32 VR256:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v16i16 VR256:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (v32i8 VR256:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrr VR256:$src), sub_ymm)>; - - // 256->512 memory form. 
- def : Pat<(v8f64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv4f64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDYrm addr:$src), sub_ymm)>; - def : Pat<(v16f32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv8f32 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSYrm addr:$src), sub_ymm)>; - def : Pat<(v8i64 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (loadv4i64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; - def : Pat<(v16i32 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v8i32 (loadv4i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; - def : Pat<(v32i16 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v16i16 (loadv4i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; - def : Pat<(v64i8 (insert_subvector (bitconvert (v16i32 immAllZerosV)), - (bc_v32i8 (loadv4i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQAYrm addr:$src), sub_ymm)>; } multiclass masked_move_for_extract; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; - - // Special patterns for storing subvector extracts of lower 128-bits - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2f64 (extract_subvector - (v4f64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4f32 (extract_subvector - (v8f32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - - def : Pat<(store (v2f64 (extract_subvector - (v4f64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v4f32 (extract_subvector - (v8f32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG 
VR256:$src,sub_xmm)))>; } // Use movaps / movups for SSE integer load / store (one byte shorter). @@ -3756,79 +3740,6 @@ (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(store (v16i8 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; - - // Special patterns for storing subvector extracts of lower 128-bits - // Its cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2i64 (extract_subvector - (v4i64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v8i32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v16i16 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v32i8 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - - def : Pat<(store (v2i64 (extract_subvector - (v4i64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v8i32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v16i16 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v32i8 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - - // If we're inserting into an all zeros vector, just use a plain move which - // will zero the upper bits. - // TODO: Is there a safe way to detect whether the producing instruction - // already zeroed the upper bits? 
- def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v2f64 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDrr VR128:$src), sub_xmm)>; - def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v4f32 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSrr VR128:$src), sub_xmm)>; - def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v2i64 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v4i32 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v8i16 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (v16i8 VR128:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArr VR128:$src), sub_xmm)>; - - def : Pat<(v4f64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (loadv2f64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPDrm addr:$src), sub_xmm)>; - def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (loadv4f32 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVAPSrm addr:$src), sub_xmm)>; - def : Pat<(v4i64 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (loadv2i64 addr:$src), (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - def : Pat<(v8i32 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (bc_v4i32 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - def : Pat<(v16i16 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (bc_v8i16 (loadv2i64 addr:$src)), - (iPTR 0))), - (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>; - def : Pat<(v32i8 (insert_subvector (bitconvert (v8i32 immAllZerosV)), - (bc_v16i8 (loadv2i64 addr:$src)), - (iPTR 0))), 
-            (SUBREG_TO_REG (i64 0), (VMOVDQArm addr:$src), sub_xmm)>;
 }
 
 //===---------------------------------------------------------------------===//
Index: lib/Target/X86/X86InstrVecCompiler.td
===================================================================
--- lib/Target/X86/X86InstrVecCompiler.td
+++ lib/Target/X86/X86InstrVecCompiler.td
@@ -30,6 +30,10 @@
           (COPY_TO_REGCLASS FR64:$src, VR128)>;
 
+//===----------------------------------------------------------------------===//
+// Subvector tricks
+//===----------------------------------------------------------------------===//
+
 // Patterns for insert_subvector/extract_subvector to/from index=0
 multiclass subvector_subreg_lowering;
+// Patterns for storing the low subvector (index 0) of a wider vector register.
+// Each instantiation emits two anonymous patterns, one for alignedstore and
+// one for store, that select a plain move store of the SubIdx subregister
+// of the source register.
+//   AlignedStr   - instruction name piece for the alignedstore pattern; the
+//                  selected instruction is "VMOV" # AlignedStr # "mr".
+//   UnalignedStr - instruction name piece for the store pattern; the selected
+//                  instruction is "VMOV" # UnalignedStr # "mr".
+//   RC           - register class of the wide source vector.
+//   DstTy        - type of the subvector being extracted and stored.
+//   SrcTy        - type of the wide source vector.
+//   SubIdx       - subregister index used to take the low part of the source.
+multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
+                                    RegisterClass RC, ValueType DstTy,
+                                    ValueType SrcTy, SubRegIndex SubIdx> {
+  // The instruction record is looked up by its concatenated name.
+  def : Pat<(alignedstore (DstTy (extract_subvector
+                                  (SrcTy RC:$src), (iPTR 0))), addr:$dst),
+            (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
+             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
+
+  def : Pat<(store (DstTy (extract_subvector
+                           (SrcTy RC:$src), (iPTR 0))), addr:$dst),
+            (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
+             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  // Storing the lower 128 bits of a 256-bit register.
+  defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
+  defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
+  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
+  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
+  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
+  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
+}
+
+let Predicates = [HasVLX] in {
+  // Special patterns for storing subvector extracts of lower 128-bits of 256.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
+                                  sub_xmm>;
+  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
+                                  sub_xmm>;
+  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
+                                  v4i64, sub_xmm>;
+  defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v4i32,
+                                  v8i32, sub_xmm>;
+  defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v8i16,
+                                  v16i16, sub_xmm>;
+  defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v16i8,
+                                  v32i8, sub_xmm>;
+
+  // Special patterns for storing subvector extracts of lower 128-bits of 512.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
+                                  sub_xmm>;
+  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
+                                  sub_xmm>;
+  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
+                                  v8i64, sub_xmm>;
+  defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v4i32,
+                                  v16i32, sub_xmm>;
+  defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v8i16,
+                                  v32i16, sub_xmm>;
+  defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v16i8,
+                                  v64i8, sub_xmm>;
+
+  // Special patterns for storing subvector extracts of lower 256-bits of 512.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of a subvector-extract
+  // store.
+  defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
+                                  sub_ymm>;
+  defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
+                                  sub_ymm>;
+  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
+                                  v8i64, sub_ymm>;
+  defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v8i32,
+                                  v16i32, sub_ymm>;
+  defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v16i16,
+                                  v32i16, sub_ymm>;
+  defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v32i8,
+                                  v64i8, sub_ymm>;
+}
+
+// If we're inserting into an all zeros vector, just use a plain move which
+// will zero the upper bits.
+// TODO: Is there a safe way to detect whether the producing instruction
+// already zeroed the upper bits?
+// Each instantiation emits two anonymous patterns for inserting a subvector
+// at index 0 of an all-zeros vector: one taking the subvector from a register
+// and one taking it from memory. Both select a move of the narrow value
+// wrapped in SUBREG_TO_REG.
+//   MoveStr - instruction name piece; the register form selects
+//             "VMOV" # MoveStr # "rr", the memory form "VMOV" # MoveStr # "rm".
+//   RC      - register class of the narrow source vector.
+//   DstTy   - type of the wide result vector.
+//   SrcTy   - type of the narrow vector being inserted.
+//   ZeroTy  - type of the all-zeros vector matched under the bitconvert.
+//   memop   - load fragment matched by the memory form.
+//   SubIdx  - subregister index the narrow value occupies in the result.
+multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
+                                   ValueType DstTy, ValueType SrcTy,
+                                   ValueType ZeroTy, PatFrag memop,
+                                   SubRegIndex SubIdx> {
+  // Register form. The instruction record is looked up by its concatenated
+  // name.
+  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+                                     (SrcTy RC:$src), (iPTR 0))),
+            (SUBREG_TO_REG (i64 0),
+             (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src), SubIdx)>;
+
+  // Memory form.
+  // NOTE(review): every instantiation below passes a plain load fragment
+  // (loadv2f64, loadv2i64, ...) while MoveStr names the VMOVAP*/VMOVDQA*
+  // family; confirm these fragments guarantee whatever alignment the selected
+  // "rm" instruction needs.
+  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+                                     (SrcTy (bitconvert (memop addr:$src))),
+                                     (iPTR 0))),
+            (SUBREG_TO_REG (i64 0),
+             (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  // 128->256 forms.
+  defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64,
+                                 sub_xmm>;
+}
+
+let Predicates = [HasVLX] in {
+  // 128->256 forms.
+  defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32,
+                                 loadv2f64, sub_xmm>;
+  defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32,
+                                 loadv4f32, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32,
+                                 loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32,
+                                 loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32,
+                                 loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32,
+                                 loadv2i64, sub_xmm>;
+
+  // 128->512 forms.
+  defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32,
+                                 loadv2f64, sub_xmm>;
+  defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32,
+                                 loadv4f32, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32,
+                                 loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32,
+                                 loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32,
+                                 loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32,
+                                 loadv2i64, sub_xmm>;
+
+  // 256->512 forms.
+  defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32,
+                                 loadv4f64, sub_ymm>;
+  defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32,
+                                 loadv8f32, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32,
+                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32,
+                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32,
+                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32,
+                                 loadv4i64, sub_ymm>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+  // 128->512 forms.
+  defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64,
+                                 sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64,
+                                 sub_xmm>;
+
+  // 256->512 forms.
+  defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32,
+                                 loadv4f64, sub_ymm>;
+  defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32,
+                                 loadv8f32, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32,
+                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32,
+                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32,
+                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
+                                 loadv4i64, sub_ymm>;
+}
+
+//===----------------------------------------------------------------------===//
+// No op bitconverts
+//===----------------------------------------------------------------------===//
+
 // Bitcasts between 128-bit vector types. Return the original type since
 // no instruction is needed for the conversion
 def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;