Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -592,6 +592,8 @@ { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE }, { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE }, @@ -1199,8 +1201,6 @@ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, - { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, - { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE }, { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE }, { X86::SBB32rr, X86::SBB32rm, 0 }, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -6268,10 +6268,10 @@ // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// -multiclass sse41_fp_unop_rm opcps, bits<8> opcpd, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag32, PatFrag mem_frag64, - Intrinsic V4F32Int, Intrinsic V2F64Int> { +multiclass sse41_fp_unop_p opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. // Vector intrinsic operation, reg @@ -6312,35 +6312,73 @@ } // ExeDomain = SSEPackedDouble } -multiclass sse41_fp_binop_rm opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int, bit Is2Addr = 1> { -let ExeDomain = GenericDomain in { - // Operation, reg. - let hasSideEffects = 0 in +multiclass avx_fp_unop_rm opcss, bits<8> opcsd, + string OpcodeStr> { +let ExeDomain = GenericDomain, hasSideEffects = 0 in { def SSr : SS4AIi8, Sched<[WriteFAdd]>; - // Operation, mem. - let mayLoad = 1, hasSideEffects = 0 in + let mayLoad = 1 in def SSm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in + def SDr : SS4AIi8, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain, hasSideEffects = 0 +} + +multiclass sse41_fp_unop_s opcss, bits<8> opcsd, + string OpcodeStr> { +let ExeDomain = GenericDomain, hasSideEffects = 0 in { + def SSr : SS4AIi8, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SSm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; + + def SDr : SS4AIi8, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain, hasSideEffects = 0 +} + +multiclass sse41_fp_binop_s opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { +let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { def SSr_Int : SS4AIi8, Sched<[WriteFAdd]>; - // Intrinsic operation, mem. - let isCodeGenOnly = 1 in def SSm_Int : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; - // Operation, reg. - let hasSideEffects = 0 in - def SDr : SS4AIi8, Sched<[WriteFAdd]>; - - // Operation, mem. - let mayLoad = 1, hasSideEffects = 0 in - def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8, Sched<[WriteFAdd]>; - // Intrinsic operation, mem. - let isCodeGenOnly = 1 in def SDm_Int : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; -} // ExeDomain = GenericDomain +} // ExeDomain = GenericDomain, isCodeGenOnly = 1 } // FP round - roundss, roundps, roundsd, roundpd let Predicates = [HasAVX] in { // Intrinsic form - defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, - loadv4f32, loadv2f64, - int_x86_sse41_round_ps, - int_x86_sse41_round_pd>, VEX; - defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, - loadv8f32, loadv4f64, - int_x86_avx_round_ps_256, - int_x86_avx_round_pd_256>, VEX, VEX_L; - defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", - int_x86_sse41_round_ss, - int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, + loadv4f32, loadv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, + loadv8f32, loadv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX, VEX_L; + defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; } let Predicates = [UseAVX] in { @@ -6498,34 +6509,37 @@ (VROUNDYPDr VR256:$src, (i32 0xB))>; } -defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, - memopv4f32, memopv2f64, - int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, int_x86_sse41_round_ps, + int_x86_sse41_round_pd>; + +defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">; + let Constraints = "$src1 = $dst" in -defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", +defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + (ROUNDSSr FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + (ROUNDSDr FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + (ROUNDSSr FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + (ROUNDSDr FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + (ROUNDSSr FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + (ROUNDSDr FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + (ROUNDSSr FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + (ROUNDSDr FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + (ROUNDSSr FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + (ROUNDSDr FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x9))>; Index: test/CodeGen/X86/pr31143.ll =================================================================== --- test/CodeGen/X86/pr31143.ll +++ test/CodeGen/X86/pr31143.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+sse4.2 < %s | FileCheck %s + +; CHECK-LABEL: test: +; CHECK: movss {{.*}}, %[[XMM0:xmm[0-9]+]] +; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]] +; CHECK: roundss $9, %[[XMM0]], %[[XMM1]] + +define void @test(float* nocapture %a, <4 x float>* nocapture %b, i32 %k) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %v = load float, float* %arrayidx, align 4 + %floor = call float @floorf(float %v) + %sub = fsub float %floor, %v + %v1 = insertelement <4 x float> undef, float %sub, i32 0 + %br = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> + store volatile <4 x float> %br, <4 x float>* %b, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %k + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +declare float @floorf(float) nounwind readnone