Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -6268,10 +6268,10 @@ // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// -multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag32, PatFrag mem_frag64, - Intrinsic V4F32Int, Intrinsic V2F64Int> { +multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. // Vector intrinsic operation, reg @@ -6312,35 +6312,73 @@ } // ExeDomain = SSEPackedDouble } -multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int, bit Is2Addr = 1> { -let ExeDomain = GenericDomain in { - // Operation, reg. - let hasSideEffects = 0 in +multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { +let ExeDomain = GenericDomain, hasSideEffects = 0 in { def SSr : SS4AIi8, Sched<[WriteFAdd]>; - // Operation, mem. - let mayLoad = 1, hasSideEffects = 0 in + let mayLoad = 1 in def SSm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; - // Intrinsic operation, reg. 
- let isCodeGenOnly = 1 in + def SDr : SS4AIi8, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain, hasSideEffects = 0 +} + +multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { +let ExeDomain = GenericDomain, hasSideEffects = 0 in { + def SSr : SS4AIi8, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SSm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; + + def SDr : SS4AIi8, Sched<[WriteFAdd]>; + + let mayLoad = 1 in + def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain, hasSideEffects = 0 +} + +multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { +let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { def SSr_Int : SS4AIi8, Sched<[WriteFAdd]>; - // Intrinsic operation, mem. - let isCodeGenOnly = 1 in def SSm_Int : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; - // Operation, reg. - let hasSideEffects = 0 in - def SDr : SS4AIi8, Sched<[WriteFAdd]>; - - // Operation, mem. - let mayLoad = 1, hasSideEffects = 0 in - def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8, Sched<[WriteFAdd]>; - // Intrinsic operation, mem. 
- let isCodeGenOnly = 1 in def SDm_Int : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; -} // ExeDomain = GenericDomain +} // ExeDomain = GenericDomain, isCodeGenOnly = 1 } // FP round - roundss, roundps, roundsd, roundpd let Predicates = [HasAVX] in { // Intrinsic form - defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, - loadv4f32, loadv2f64, - int_x86_sse41_round_ps, - int_x86_sse41_round_pd>, VEX; - defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, - loadv8f32, loadv4f64, - int_x86_avx_round_ps_256, - int_x86_avx_round_pd_256>, VEX, VEX_L; - defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", - int_x86_sse41_round_ss, - int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, + loadv4f32, loadv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, + loadv8f32, loadv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX, VEX_L; + defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; } let Predicates = [UseAVX] in { @@ -6498,34 +6509,37 @@ (VROUNDYPDr VR256:$src, (i32 0xB))>; } -defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, - memopv4f32, memopv2f64, - int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, int_x86_sse41_round_ps, + int_x86_sse41_round_pd>; + +defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">; + let Constraints = "$src1 = $dst" in -defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", +defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + (ROUNDSSr 
FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + (ROUNDSDr FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + (ROUNDSSr FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + (ROUNDSDr FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + (ROUNDSSr FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + (ROUNDSDr FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + (ROUNDSSr FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + (ROUNDSDr FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + (ROUNDSSr FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + (ROUNDSDr FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x9))>; Index: test/CodeGen/X86/pr31143.ll =================================================================== --- test/CodeGen/X86/pr31143.ll +++ test/CodeGen/X86/pr31143.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+sse4.2 < %s | FileCheck %s + +; CHECK-LABEL: test: +; CHECK: movss {{.*}}, %[[XMM0:xmm[0-9]+]] +; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]] +; CHECK: roundss $9, %[[XMM0]], %[[XMM1]] + +define void @test(float* nocapture %a, <4 x float>* nocapture %b, i32 %k) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %v = load float, float* %arrayidx, align 4 + 
%floor = call float @floorf(float %v) + %sub = fsub float %floor, %v + %v1 = insertelement <4 x float> undef, float %sub, i32 0 + %br = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + store volatile <4 x float> %br, <4 x float>* %b, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %k + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +declare float @floorf(float) nounwind readnone Index: test/CodeGen/X86/stack-folding-fp-sse42.ll =================================================================== --- test/CodeGen/X86/stack-folding-fp-sse42.ll +++ test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -978,9 +978,11 @@ } declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone +; Don't fold the non-int version of roundsd. define double @stack_fold_roundsd(double %a0) optsize { ;CHECK-LABEL: stack_fold_roundsd - ;CHECK: roundsd $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload + ;CHECK: movsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Reload + ;CHECK: roundsd $9, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call double @llvm.floor.f64(double %a0) ret double %2 @@ -996,9 +998,11 @@ } declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone +; Don't fold the non-int version of roundss. 
define float @stack_fold_roundss(float %a0) minsize { ;CHECK-LABEL: stack_fold_roundss - ;CHECK: roundss $9, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload + ;CHECK: movss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Reload + ;CHECK: roundss $9, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call float @llvm.floor.f32(float %a0) ret float %2