Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -253,6 +253,23 @@
                          MaskingConstraint, NoItinerary, IsCommutable,
                          IsKCommutable>;
 
+// Similar to AVX512_maskable_common, but with scalar types.
+multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
+                                     dag Outs,
+                                     dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+                                     string OpcodeStr,
+                                     string AttSrcAsm, string IntelSrcAsm,
+                                     SDNode Select = vselect,
+                                     string MaskingConstraint = "",
+                                     InstrItinClass itin = NoItinerary,
+                                     bit IsCommutable = 0,
+                                     bit IsKCommutable = 0> :
+  AVX512_maskable_custom;
+
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction. In the masking case, the
 // perserved vector elements come from a new dummy input operand tied to $dst.
@@ -285,6 +302,18 @@
                          (X86selects _.KRCWM:$mask, RHS, _.RC:$src0),
                          X86selects, "$src0 = $dst", itin, IsCommutable>;
 
+// Similar to AVX512_maskable_scalar, but with scalar types.
+multiclass AVX512_maskable_fp_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+                                     dag Outs, dag Ins, string OpcodeStr,
+                                     string AttSrcAsm, string IntelSrcAsm,
+                                     InstrItinClass itin = NoItinerary,
+                                     bit IsCommutable = 0> :
+  AVX512_maskable_fp_common;
+
 // Similar to AVX512_maskable but in this case one of the source operands
 // ($src1) is already tied to $dst so we just use that for the preserved
 // vector elements. NOTE that the NonTiedIns (the ins dag) should exclude
@@ -5984,45 +6013,73 @@
 //===----------------------------------------------------------------------===//
 multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                 X86VectorVTInfo _Src, SDNode OpNode> {
-  defm rr : AVX512_maskable_scalar, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
-  defm rm : AVX512_maskable_scalar, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+  }
+
+  defm rr : AVX512_maskable_fp_scalar,
+            EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+  let mayLoad = 1 in
+  defm rm : AVX512_maskable_fp_scalar,
+            EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+  }
 // Scalar Coversion with SAE - suppress all exceptions
 multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                     X86VectorVTInfo _Src, SDNode OpNodeRnd> {
-  defm rrb : AVX512_maskable_scalar, EVEX_4V, VEX_LIG, EVEX_B;
+  }
+
+  defm rrb : AVX512_maskable_fp_scalar,
+             EVEX_4V, VEX_LIG, EVEX_B;
 }
 // Scalar Conversion with rounding control (RC)
 multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                                    X86VectorVTInfo _Src, SDNode OpNodeRnd> {
-  defm rrb : AVX512_maskable_scalar, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_B, EVEX_RC;
+  }
+
+  defm rrb : AVX512_maskable_fp_scalar,
+             EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+             EVEX_B, EVEX_RC;
 }
 multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
                                       SDNode OpNodeRnd, X86VectorVTInfo _src,
@@ -6049,39 +6106,40 @@
                                       X86fpextRnd,f32x_info, f64x_info >;
 def : Pat<(f64 (fpextend FR32X:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
-                            (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+          (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
           Requires<[HasAVX512]>;
 def : Pat<(f64 (fpextend (loadf32 addr:$src))),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+          (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
          Requires<[HasAVX512]>;
 def : Pat<(f64 (extloadf32 addr:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+          (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
           Requires<[HasAVX512, OptForSize]>;
 def : Pat<(f64 (extloadf32 addr:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
-                            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+          (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
           Requires<[HasAVX512, OptForSpeed]>;
 def : Pat<(f32 (fpround FR64X:$src)),
-          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
-                            (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+          (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
          Requires<[HasAVX512]>;
 def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128X:$dst),
                   (v4f32 (scalar_to_vector
                           (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
-          (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS VR128X:$dst, FR32X),
+                                          (COPY_TO_REGCLASS VR128X:$src, FR64X)),
+                            VR128X)>,
           Requires<[HasAVX512]>;
 def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128X:$dst),
                   (v2f64 (scalar_to_vector
                           (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
-          (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS VR128X:$dst, FR64X),
+                                          (COPY_TO_REGCLASS VR128X:$src, FR32X)),
+                            VR128X)>,
           Requires<[HasAVX512]>;
 //===----------------------------------------------------------------------===//
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -1724,20 +1724,21 @@
 // Convert scalar double to scalar single
 let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
-                        (ins FR64:$src1, FR64:$src2),
+                        (ins FR32:$src1, FR64:$src2),
                         "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                         IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                         Sched<[WriteCvtF2F]>;
 let mayLoad = 1 in
 def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
-                     (ins FR64:$src1, f64mem:$src2),
+                     (ins FR32:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasAVX, OptForSize]>,
                      VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 }
-def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+def : Pat<(f32 (fpround FR64:$src)),
+          (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
          Requires<[UseAVX]>;
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1789,14 +1790,14 @@
 // SSE2 instructions with XS prefix
 let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
-                    (ins FR32:$src1, FR32:$src2),
+                    (ins FR64:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [], IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasAVX]>,
                     VEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
 let mayLoad = 1 in
 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
-                    (ins FR32:$src1, f32mem:$src2),
+                    (ins FR64:$src1, f32mem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
                     Requires<[HasAVX, OptForSize]>,
@@ -1804,15 +1805,15 @@
 }
 def : Pat<(f64 (fpextend FR32:$src)),
-          (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+          (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
 def : Pat<(fpextend (loadf32 addr:$src)),
-          (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[UseAVX]>;
 def : Pat<(extloadf32 addr:$src),
-          (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
           Requires<[UseAVX, OptForSize]>;
 def : Pat<(extloadf32 addr:$src),
-          (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
           Requires<[UseAVX, OptForSpeed]>;
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
Index: test/CodeGen/X86/vector-half-conversions.ll
===================================================================
--- test/CodeGen/X86/vector-half-conversions.ll
+++ test/CodeGen/X86/vector-half-conversions.ll
@@ -1941,25 +1941,25 @@
 ; AVX1-LABEL: cvt_8i16_to_8f64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %xmm0, %rdx
-; AVX1-NEXT:    movq %rdx, %r9
+; AVX1-NEXT:    movq %rdx, %r8
 ; AVX1-NEXT:    movl %edx, %r10d
-; AVX1-NEXT:    movswl %dx, %r8d
+; AVX1-NEXT:    movswl %dx, %r9d
 ; AVX1-NEXT:    shrq $48, %rdx
-; AVX1-NEXT:    shrq $32, %r9
+; AVX1-NEXT:    shrq $32, %r8
 ; AVX1-NEXT:    shrl $16, %r10d
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX1-NEXT:    movq %rdi, %rsi
-; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    movl %edi, %esi
 ; AVX1-NEXT:    movswl %di, %ecx
 ; AVX1-NEXT:    shrq $48, %rdi
-; AVX1-NEXT:    shrq $32, %rsi
-; AVX1-NEXT:    shrl $16, %eax
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    shrq $32, %rax
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movswl %si, %esi
+; AVX1-NEXT:    vmovd %esi, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; AVX1-NEXT:    vmovd %ecx, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT:    movswl %si, %eax
+; AVX1-NEXT:    cwtl
 ; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
 ; AVX1-NEXT:    movswl %di, %eax
@@ -1968,9 +1968,9 @@
 ; AVX1-NEXT:    movswl %r10w, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %r8d, %xmm5
+; AVX1-NEXT:    vmovd %r9d, %xmm5
 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT:    movswl %r9w, %eax
+; AVX1-NEXT:    movswl %r8w, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm6
 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
 ; AVX1-NEXT:    movswl %dx, %eax
@@ -1995,25 +1995,25 @@
 ; AVX2-LABEL: cvt_8i16_to_8f64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq %xmm0, %rdx
-; AVX2-NEXT:    movq %rdx, %r9
+; AVX2-NEXT:    movq %rdx, %r8
 ; AVX2-NEXT:    movl %edx, %r10d
-; AVX2-NEXT:    movswl %dx, %r8d
+; AVX2-NEXT:    movswl %dx, %r9d
 ; AVX2-NEXT:    shrq $48, %rdx
-; AVX2-NEXT:    shrq $32, %r9
+; AVX2-NEXT:    shrq $32, %r8
 ; AVX2-NEXT:    shrl $16, %r10d
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT:    movq %rdi, %rsi
-; AVX2-NEXT:    movl %edi, %eax
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movl %edi, %esi
 ; AVX2-NEXT:    movswl %di, %ecx
 ; AVX2-NEXT:    shrq $48, %rdi
-; AVX2-NEXT:    shrq $32, %rsi
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    shrq $32, %rax
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movswl %si, %esi
+; AVX2-NEXT:    vmovd %esi, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; AVX2-NEXT:    vmovd %ecx, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT:    movswl %si, %eax
+; AVX2-NEXT:    cwtl
 ; AVX2-NEXT:    vmovd %eax, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
 ; AVX2-NEXT:    movswl %di, %eax
@@ -2022,9 +2022,9 @@
 ; AVX2-NEXT:    movswl %r10w, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %r8d, %xmm5
+; AVX2-NEXT:    vmovd %r9d, %xmm5
 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT:    movswl %r9w, %eax
+; AVX2-NEXT:    movswl %r8w, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm6
 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
 ; AVX2-NEXT:    movswl %dx, %eax