Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9665,6 +9665,28 @@
   return false;
 }
 
+static SDValue foldFPUndef(SelectionDAG &DAG, SDValue N0, SDValue N1) {
+  // fold (fadd undef, x) -> undef/NaN
+  if (N0.isUndef()) {
+    // If we know x cannot be NaN, then we propagate undef.
+    if (DAG.isKnownNeverNaN(N0))
+      return N0;
+
+    // TODO: If not, we should propagate NaN, but it's not yet clear how.
+  }
+
+  // fold (fadd x, undef) -> undef/NaN
+  if (N1.isUndef()) {
+    // If we know x cannot be NaN, then we propagate undef.
+    if (DAG.isKnownNeverNaN(N1))
+      return N1;
+
+    // TODO: If not, we should propagate NaN, but it's not yet clear how.
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -9675,6 +9697,10 @@
   const TargetOptions &Options = DAG.getTarget().Options;
   const SDNodeFlags Flags = N->getFlags();
 
+  // fold (fadd x, undef) -> undef/NaN
+  if (SDValue X = foldFPUndef(DAG, N0, N1))
+    return X;
+
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -9839,6 +9865,10 @@
   const TargetOptions &Options = DAG.getTarget().Options;
   const SDNodeFlags Flags = N->getFlags();
 
+  // fold (fsub x, undef) -> undef/NaN
+  if (SDValue X = foldFPUndef(DAG, N0, N1))
+    return X;
+
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -9910,6 +9940,10 @@
   const TargetOptions &Options = DAG.getTarget().Options;
   const SDNodeFlags Flags = N->getFlags();
 
+  // fold (fmul x, undef) -> undef/NaN
+  if (SDValue X = foldFPUndef(DAG, N0, N1))
+    return X;
+
   // fold vector ops
   if (VT.isVector()) {
     // This just handles C1 * C2 for vectors. Other vector folds are below.
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3455,6 +3455,9 @@
   if (Op->getFlags().hasNoNaNs())
     return true;
 
+  if (Op.isUndef())
+    return true;
+
   // If the value is a constant, we can obviously see if it is a NaN or not.
   if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
     return !C->getValueAPF().isNaN();
Index: test/CodeGen/X86/2012-04-26-sdglue.ll
===================================================================
--- test/CodeGen/X86/2012-04-26-sdglue.ll
+++ test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -4,54 +4,60 @@
 ; rdar://11314175: SD Scheduler, BuildSchedUnits assert:
 ; N->getNodeId() == -1 && "Node already inserted!
 
-define void @func() nounwind ssp {
+define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, <8 x float> %f, <4 x float> %g, <8 x float> %h, <8 x float> %i, <8 x float>* %j) nounwind ssp {
 ; CHECK-LABEL: func:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovups 0, %xmm0
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
-; CHECK-NEXT: vbroadcastss 32, %xmm3
-; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; CHECK-NEXT: vmulps %ymm0, %ymm2, %ymm2
-; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $32, %rsp
+; CHECK-NEXT: vmovdqu 0, %xmm8
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm9
+; CHECK-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT: vmovdqu 32, %xmm0
+; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vmulps %ymm3, %ymm9, %ymm1
+; CHECK-NEXT: vmulps %ymm4, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vaddps %ymm5, %ymm0, %ymm0
+; CHECK-NEXT: vmulps %xmm0, %xmm6, %xmm0
 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm7, %ymm0
 ; CHECK-NEXT: vhaddps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vsubps 16(%rbp), %ymm0, %ymm0
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vhaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
   %tmp = load <4 x float>, <4 x float>* null, align 1
   %tmp14 = getelementptr <4 x float>, <4 x float>* null, i32 2
   %tmp15 = load <4 x float>, <4 x float>* %tmp14, align 1
   %tmp16 = shufflevector <4 x float> %tmp, <4 x float> , <8 x i32>
-  %tmp17 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp16, <4 x float> undef, i8 1)
+  %tmp17 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp16, <4 x float> %a, i8 1)
   %tmp18 = bitcast <4 x float> %tmp to <16 x i8>
-  %tmp19 = shufflevector <16 x i8> %tmp18, <16 x i8> undef, <16 x i32>
+  %tmp19 = shufflevector <16 x i8> %tmp18, <16 x i8> %b, <16 x i32>
   %tmp20 = bitcast <16 x i8> %tmp19 to <4 x float>
   %tmp21 = bitcast <4 x float> %tmp15 to <16 x i8>
-  %tmp22 = shufflevector <16 x i8> undef, <16 x i8> %tmp21, <16 x i32>
+  %tmp22 = shufflevector <16 x i8> %c, <16 x i8> %tmp21, <16 x i32>
   %tmp23 = bitcast <16 x i8> %tmp22 to <4 x float>
   %tmp24 = shufflevector <4 x float> %tmp20, <4 x float> , <8 x i32>
   %tmp25 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %tmp24, <4 x float> %tmp23, i8 1)
-  %tmp26 = fmul <8 x float> %tmp17, undef
-  %tmp27 = fmul <8 x float> %tmp25, undef
+  %tmp26 = fmul <8 x float> %tmp17, %d
+  %tmp27 = fmul <8 x float> %tmp25, %e
   %tmp28 = fadd <8 x float> %tmp26, %tmp27
-  %tmp29 = fadd <8 x float> %tmp28, undef
+  %tmp29 = fadd <8 x float> %tmp28, %f
   %tmp30 = shufflevector <8 x float> %tmp29, <8 x float> undef, <4 x i32>
-  %tmp31 = fmul <4 x float> undef, %tmp30
+  %tmp31 = fmul <4 x float> %g, %tmp30
   %tmp32 = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> zeroinitializer, <4 x float> %tmp31, i8 1)
-  %tmp33 = fadd <8 x float> undef, %tmp32
+  %tmp33 = fadd <8 x float> %h, %tmp32
   %tmp34 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %tmp33, <8 x float> undef) nounwind
-  %tmp35 = fsub <8 x float> %tmp34, undef
+  %tmp35 = fsub <8 x float> %tmp34, %i
   %tmp36 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> zeroinitializer, <8 x float> %tmp35) nounwind
-  store <8 x float> %tmp36, <8 x float>* undef, align 32
+  store <8 x float> %tmp36, <8 x float>* %j, align 32
   ret void
 }
Index: test/CodeGen/X86/pr23103.ll
===================================================================
--- test/CodeGen/X86/pr23103.ll
+++ test/CodeGen/X86/pr23103.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+avx < %s | FileCheck %s
 
 ; When commuting a VADDSDrr instruction, verify that the 'IsUndef' flag is
@@ -8,11 +9,13 @@
 
 define <1 x double> @pr23103(<1 x double>* align 8 %Vp) {
 ; CHECK-LABEL: pr23103:
-; CHECK: vmovsd (%rdi), %xmm0
-; CHECK-NEXT: vmovsd %xmm0, {{.*}}(%rsp) {{.*#+}} 8-byte Spill
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT: callq foo
-; CHECK-NEXT: vaddsd {{.*}}(%rsp), %xmm0, %xmm0 {{.*#+}} 8-byte Folded Reload
-; CHECK: retq
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
 entry:
   %V = load <1 x double>, <1 x double>* %Vp, align 8
   %call = call zeroext i1 @foo(<1 x double> %V)
Index: test/CodeGen/X86/pr34177.ll
===================================================================
--- test/CodeGen/X86/pr34177.ll
+++ test/CodeGen/X86/pr34177.ll
@@ -5,48 +5,60 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define void @test() local_unnamed_addr {
+define void @test(<4 x i64> %a, <4 x x86_fp80> %b, <4 x x86_fp80> %c, <8 x x86_fp80>* %d) local_unnamed_addr {
 ; CHECK-LABEL: test:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3]
-; CHECK-NEXT: vpextrq $1, %xmm0, %rax
-; CHECK-NEXT: vmovq %xmm0, %rcx
-; CHECK-NEXT: negq %rdx
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3]
+; CHECK-NEXT: vmovq %xmm1, %r8
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vmovq %xmm2, %rcx
+; CHECK-NEXT: vpextrq $1, %xmm1, %rdx
+; CHECK-NEXT: vpextrq $1, %xmm2, %rsi
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: negq %rax
 ; CHECK-NEXT: fld1
 ; CHECK-NEXT: fldz
 ; CHECK-NEXT: fld %st(0)
 ; CHECK-NEXT: fcmove %st(2), %st(0)
-; CHECK-NEXT: cmpq %rax, %rcx
-; CHECK-NEXT: fld %st(1)
-; CHECK-NEXT: fcmove %st(3), %st(0)
-; CHECK-NEXT: cmpq %rax, %rax
-; CHECK-NEXT: fld %st(2)
-; CHECK-NEXT: fcmove %st(4), %st(0)
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: cmpq %rax, %rax
-; CHECK-NEXT: fld %st(3)
-; CHECK-NEXT: fcmove %st(5), %st(0)
-; CHECK-NEXT: fstp %st(5)
-; CHECK-NEXT: fxch %st(2)
-; CHECK-NEXT: fadd %st(3)
-; CHECK-NEXT: fxch %st(4)
-; CHECK-NEXT: fadd %st(3)
-; CHECK-NEXT: fxch %st(2)
-; CHECK-NEXT: fadd %st(3)
-; CHECK-NEXT: fxch %st(1)
 ; CHECK-NEXT: faddp %st(3)
+; CHECK-NEXT: cmpq %rsi, %rdx
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: fcmove %st(2), %st(0)
+; CHECK-NEXT: faddp %st(5)
+; CHECK-NEXT: cmpq %rcx, %r8
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: fcmove %st(2), %st(0)
+; CHECK-NEXT: faddp %st(4)
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: cmpq %rax, %rcx
+; CHECK-NEXT: fcmove %st(1), %st(0)
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: faddp %st(1)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt 70(%rdi)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt 50(%rdi)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt 30(%rdi)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt 10(%rdi)
 ; CHECK-NEXT: fxch %st(3)
-; CHECK-NEXT: fstpt (%rax)
+; CHECK-NEXT: fstpt 60(%rdi)
 ; CHECK-NEXT: fxch %st(1)
-; CHECK-NEXT: fstpt (%rax)
+; CHECK-NEXT: fstpt 40(%rdi)
 ; CHECK-NEXT: fxch %st(1)
-; CHECK-NEXT: fstpt (%rax)
-; CHECK-NEXT: fstpt (%rax)
-  %1 = icmp eq <4 x i64> , undef
+; CHECK-NEXT: fstpt 20(%rdi)
+; CHECK-NEXT: fstpt (%rdi)
+  %1 = icmp eq <4 x i64> , %a
   %2 = select <4 x i1> %1, <4 x x86_fp80> , <4 x x86_fp80> zeroinitializer
-  %3 = fadd <4 x x86_fp80> undef, %2
-  %4 = shufflevector <4 x x86_fp80> %3, <4 x x86_fp80> undef, <8 x i32>
-  store <8 x x86_fp80> %4, <8 x x86_fp80>* undef, align 16
+  %3 = fadd <4 x x86_fp80> %b, %2
+  %4 = shufflevector <4 x x86_fp80> %3, <4 x x86_fp80> %c, <8 x i32>
+  store <8 x x86_fp80> %4, <8 x x86_fp80>* %d, align 16
   unreachable
 }
Index: test/CodeGen/X86/sse3-avx-addsub-2.ll
===================================================================
--- test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -400,51 +400,51 @@
   ret <4 x float> %vecinsert2
 }
 
-define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
+define <4 x float> @test16(<4 x float> %A, <4 x float> %B, float %C, float %D) {
 ; SSE-LABEL: test16:
 ; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: subss %xmm2, %xmm4
 ; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-NEXT: subss %xmm4, %xmm3
-; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: addss %xmm0, %xmm4
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm5[1,1]
+; SSE-NEXT: subss %xmm5, %xmm2
+; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm3, %xmm5
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-NEXT: movaps %xmm4, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test16:
 ; AVX: # BB#0:
-; AVX-NEXT: vsubss %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4
+; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm3, %xmm5, %xmm3
 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX-NEXT: retq
   %1 = extractelement <4 x float> %A, i32 0
   %2 = extractelement <4 x float> %B, i32 0
-  %sub = fsub float %1, undef
+  %sub = fsub float %1, %C
   %3 = extractelement <4 x float> %A, i32 2
   %4 = extractelement <4 x float> %B, i32 2
   %sub2 = fsub float %3, %4
   %5 = extractelement <4 x float> %A, i32 1
   %6 = extractelement <4 x float> %B, i32 1
-  %add = fadd float %5, undef
+  %add = fadd float %5, %D
   %7 = extractelement <4 x float> %A, i32 3
   %8 = extractelement <4 x float> %B, i32 3
   %add2 = fadd float %7, %8