Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -104,6 +104,12 @@ /// stable indices of nodes within the worklist. DenseMap<SDNode *, unsigned> WorklistMap; + /// \brief Set of nodes which have been combined (at least once). + /// + /// This is used to allow us to reliably add any operands of a DAG node + /// which have not yet been combined to the worklist. + SmallPtrSet<SDNode *, 64> CombinedNodes; + // AA - Used for DAG load/store alias analysis. AliasAnalysis &AA; @@ -136,6 +142,8 @@ /// removeFromWorklist - remove all instances of N from the worklist. /// void removeFromWorklist(SDNode *N) { + CombinedNodes.erase(N); + auto It = WorklistMap.find(N); if (It == WorklistMap.end()) return; // Not in the worklist. @@ -1151,6 +1159,17 @@ if (recursivelyDeleteUnusedNodes(N)) continue; + DEBUG(dbgs() << "\nCombining: "; + N->dump(&DAG)); + + // Add any operands of the new node which have not yet been combined to the + // worklist as well. Because the worklist uniques things already, this + // won't repeatedly process the same operand. + CombinedNodes.insert(N); + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + if (!CombinedNodes.count(N->getOperand(i).getNode())) + AddToWorklist(N->getOperand(i).getNode()); + WorklistRemover DeadNodes(*this); SDValue RV = combine(N); @@ -1171,11 +1190,8 @@ RV.getNode()->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); - DEBUG(dbgs() << "\nReplacing.3 "; - N->dump(&DAG); - dbgs() << "\nWith: "; - RV.getNode()->dump(&DAG); - dbgs() << '\n'); + DEBUG(dbgs() << " ... into: "; + RV.getNode()->dump(&DAG)); // Transfer debug value. 
DAG.TransferDbgValues(SDValue(N, 0), RV); Index: test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll =================================================================== --- test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -O3 < %s | FileCheck %s -; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s -; Test case for a DAG combiner bug where we combined an indexed load -; with an extension (sext, zext, or any) into a regular extended load, -; i.e., dropping the indexed value. -; - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -target triple = "arm64-apple-ios" - -%class.A = type { i64, i64 } -%class.C = type { i64 } - -; CHECK-LABEL: XX: -; CHECK: ldr -define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) { -entry: - br i1 %tst, label %if.then, label %lor.rhs.i - -lor.rhs.i: ; preds = %entry - %tmp = load i32* %addr, align 4 - %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1 - %tmp1 = load i64* %y.i.i.i, align 8 - %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32 - %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17 - %add12.i = add nsw i32 0, %div11.i - %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32 - %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32 - %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13 - %add16.i = add nsw i32 %add12.i, %div15.i - %rem.i.i = srem i32 %add16.i, %tmp - %idxprom = sext i32 %rem.i.i to i64 - %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom - %tobool533 = icmp eq %class.C* %pC, null - br i1 %tobool533, label %while.end, label %while.body - -if.then: ; preds = %entry - ret i32 42 - -while.body: ; preds = %lor.rhs.i - ret i32 5 - -while.end: ; preds = %lor.rhs.i - %tmp3 = load %class.C** %arrayidx, align 8 - ret i32 50 -} Index: test/CodeGen/ARM/aapcs-hfa-code.ll =================================================================== --- test/CodeGen/ARM/aapcs-hfa-code.ll +++ 
test/CodeGen/ARM/aapcs-hfa-code.ll @@ -92,12 +92,10 @@ call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0) ; CHECK-LABEL: test_1double_misaligned: -; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0 -; CHECK-DAG: mov r[[BASE:[0-9]+]], sp ; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0 +; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0 ; CHECK-DAG: movt [[ONEHI]], #16368 -; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]! -; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4] +; CHECK-DAG: strd [[ONELO]], [[ONEHI]], [sp, #8] ; CHECK-M4F-LABEL: test_1double_misaligned: ; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 Index: test/CodeGen/Mips/cmov.ll =================================================================== --- test/CodeGen/Mips/cmov.ll +++ test/CodeGen/Mips/cmov.ll @@ -757,24 +757,9 @@ ; ALL-LABEL: slti6: -; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; 32-CMOV-DAG: xori [[R1]], [[R1]], 1 -; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 -; 32-CMOV-NOT: movn - -; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; 32-CMP-DAG: xori [[R1]], [[R1]], 1 -; 32-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 -; 32-CMP-NOT: seleqz -; 32-CMP-NOT: selnez - -; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; 64-CMOV-DAG: xori [[R1]], [[R1]], 1 -; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 -; 64-CMOV-NOT: movn - -; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7 -; 64-CMP-DAG: xori [[R1]], [[R1]], 1 -; 64-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 -; 64-CMP-NOT: seleqz -; 64-CMP-NOT: selnez +; ALL-DAG: addiu [[R1:\$[0-9]+]], $zero, 6 +; ALL-DAG: slt [[R1]], [[R1]], $4 +; ALL-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3 +; ALL-NOT: movn +; ALL-NOT: seleqz +; ALL-NOT: selnez Index: test/CodeGen/R600/add_i64.ll =================================================================== --- test/CodeGen/R600/add_i64.ll +++ test/CodeGen/R600/add_i64.ll @@ -70,8 +70,8 @@ } ; SI-LABEL: @trunc_i64_add_to_i32 -; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]] -; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]] +; SI: 
S_LOAD_DWORD s[[SREG0:[0-9]+]] +; SI: S_LOAD_DWORD s[[SREG1:[0-9]+]] ; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI-NOT: ADDC ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] Index: test/CodeGen/R600/or.ll =================================================================== --- test/CodeGen/R600/or.ll +++ test/CodeGen/R600/or.ll @@ -116,10 +116,10 @@ } ; SI-LABEL: @trunc_i64_or_to_i32 -; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]] -; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]] -; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: S_LOAD_DWORD s[[SREG0:[0-9]+]] +; SI: S_LOAD_DWORD s[[SREG1:[0-9]+]] +; SI: S_OR_B32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] ; SI: BUFFER_STORE_DWORD [[VRESULT]], define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %add = or i64 %b, %a Index: test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll =================================================================== --- test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll +++ test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll @@ -7,7 +7,7 @@ %tmp1 = bitcast double %a to <8 x i8> %tmp2 = bitcast double %b to <8 x i8> %tmp3 = add <8 x i8> %tmp1, %tmp2 -; CHECK: paddw +; CHECK: paddb store <8 x i8> %tmp3, <8 x i8>* null ret void } @@ -18,7 +18,7 @@ %tmp1 = bitcast double %a to <4 x i16> %tmp2 = bitcast double %b to <4 x i16> %tmp3 = add <4 x i16> %tmp1, %tmp2 -; CHECK: paddd +; CHECK: paddw store <4 x i16> %tmp3, <4 x i16>* null ret void } @@ -29,7 +29,7 @@ %tmp1 = bitcast double %a to <2 x i32> %tmp2 = bitcast double %b to <2 x i32> %tmp3 = add <2 x i32> %tmp1, %tmp2 -; CHECK: paddq +; CHECK: paddd store <2 x i32> %tmp3, <2 x i32>* null ret void } Index: test/CodeGen/X86/avx-sext.ll =================================================================== --- test/CodeGen/X86/avx-sext.ll +++ test/CodeGen/X86/avx-sext.ll @@ -177,9 +177,17 @@ } ; AVX: 
sext_4i8_to_4i64 -; AVX: vpmovsxbd -; AVX: vpmovsxdq -; AVX: vpmovsxdq +; AVX: movsbq +; AVX: vmovq +; AVX: movsbq +; AVX: vmovq +; AVX: vpunpcklqdq +; AVX: movsbq +; AVX: vmovq +; AVX: movsbq +; AVX: vmovq +; AVX: vpunpcklqdq +; AVX: vinsertf128 ; AVX: ret define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { %X = load <4 x i8>* %ptr @@ -188,9 +196,17 @@ } ; AVX: sext_4i16_to_4i64 -; AVX: vpmovsxwd -; AVX: vpmovsxdq -; AVX: vpmovsxdq +; AVX: movswq +; AVX: vmovq +; AVX: movswq +; AVX: vmovq +; AVX: vpunpcklqdq +; AVX: movswq +; AVX: vmovq +; AVX: movswq +; AVX: vmovq +; AVX: vpunpcklqdq +; AVX: vinsertf128 ; AVX: ret define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { %X = load <4 x i16>* %ptr Index: test/CodeGen/X86/i8-umulo.ll =================================================================== --- test/CodeGen/X86/i8-umulo.ll +++ test/CodeGen/X86/i8-umulo.ll @@ -3,7 +3,7 @@ declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b) define i8 @testumulo(i32 %argc) { -; CHECK: imulw +; CHECK: imull ; CHECK: testb %{{.+}}, %{{.+}} ; CHECK: je [[NOOVERFLOWLABEL:.+]] ; CHECK: {{.*}}[[NOOVERFLOWLABEL]]: Index: test/CodeGen/X86/jump_sign.ll =================================================================== --- test/CodeGen/X86/jump_sign.ll +++ test/CodeGen/X86/jump_sign.ll @@ -284,7 +284,7 @@ define i32 @func_test1(i32 %p1) nounwind uwtable { entry: ; CHECK-LABEL: func_test1: -; CHECK: testb +; CHECK: andb ; CHECK: j ; CHECK: ret %0 = load i32* @b, align 4 Index: test/CodeGen/X86/lower-bitcast.ll =================================================================== --- test/CodeGen/X86/lower-bitcast.ll +++ test/CodeGen/X86/lower-bitcast.ll @@ -68,13 +68,13 @@ %2 = bitcast <2 x i32> %add to i64 ret i64 %2 } -; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd. +; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd. ; Ideally, we should fold that sequence into a single paddd. 
This is fixed with ; the widening legalization. ; ; CHECK-LABEL: test4 ; CHECK: pshufd -; CHECK-NEXT: paddq +; CHECK-NEXT: paddd ; CHECK-NEXT: pshufd ; CHECK: ret ; Index: test/CodeGen/X86/pr15267.ll =================================================================== --- test/CodeGen/X86/pr15267.ll +++ test/CodeGen/X86/pr15267.ll @@ -48,19 +48,22 @@ ; CHECK: test3 ; CHECK: movzbl -; CHECK: shrl -; CHECK: andl $1 -; CHECK: andl $1 -; CHECK: vmovd -; CHECK: pinsrd $1 -; CHECK: shrl $2 -; CHECK: andl $1 -; CHECK: pinsrd $2 -; CHECK: shrl $3 -; CHECK: andl $1 -; CHECK: pinsrd $3 -; CHECK: pslld -; CHECK: psrad -; CHECK: pmovsxdq -; CHECK: pmovsxdq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: vpunpcklqdq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: vpunpcklqdq +; CHECK: vinsertf128 ; CHECK: ret Index: test/CodeGen/X86/store-narrow.ll =================================================================== --- test/CodeGen/X86/store-narrow.ll +++ test/CodeGen/X86/store-narrow.ll @@ -34,7 +34,7 @@ ; X64: movb %sil, 1(%rdi) ; X32-LABEL: test2: -; X32: movzbl 8(%esp), %e[[REG:[abcd]]]x +; X32: movb 8(%esp), %[[REG:[abcd]]]l ; X32: movb %[[REG]]l, 1(%{{.*}}) } @@ -67,8 +67,8 @@ ; X64: movw %si, 2(%rdi) ; X32-LABEL: test4: -; X32: movl 8(%esp), %e[[REG:[abcd]x]] -; X32: movw %[[REG]], 2(%{{.*}}) +; X32: movw 8(%esp), %[[REG:[abcd]]]x +; X32: movw %[[REG]]x, 2(%{{.*}}) } define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp { @@ -84,8 +84,8 @@ ; X64: movw %si, 2(%rdi) ; X32-LABEL: test5: -; X32: movzwl 8(%esp), %e[[REG:[abcd]x]] -; X32: movw %[[REG]], 2(%{{.*}}) +; X32: movw 8(%esp), %[[REG:[abcd]]]x +; X32: movw %[[REG]]x, 2(%{{.*}}) } define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp { Index: test/CodeGen/X86/trunc-ext-ld-st.ll 
=================================================================== --- test/CodeGen/X86/trunc-ext-ld-st.ll +++ test/CodeGen/X86/trunc-ext-ld-st.ll @@ -32,7 +32,7 @@ ;CHECK-LABEL: load_2_i32: ;CHECK: pmovzxdq -;CHECK: paddq +;CHECK: paddd ;CHECK: pshufd ;CHECK: ret define void @load_2_i32(<2 x i32>* %A) { @@ -56,7 +56,7 @@ ;CHECK-LABEL: load_4_i16: ;CHECK: pmovzxwd -;CHECK: paddd +;CHECK: paddw ;CHECK: pshufb ;CHECK: ret define void @load_4_i16(<4 x i16>* %A) { @@ -68,7 +68,7 @@ ;CHECK-LABEL: load_8_i8: ;CHECK: pmovzxbw -;CHECK: paddw +;CHECK: paddb ;CHECK: pshufb ;CHECK: ret define void @load_8_i8(<8 x i8>* %A) { Index: test/CodeGen/X86/vector-idiv.ll =================================================================== --- test/CodeGen/X86/vector-idiv.ll +++ test/CodeGen/X86/vector-idiv.ll @@ -122,7 +122,7 @@ ; SSE41-LABEL: test8: ; SSE41: pmuldq ; SSE41: pshufd $49 -; SSE41-NOT: pshufd $49 +; SSE41: pshufd $49 ; SSE41: pmuldq ; SSE41: shufps $-35 ; SSE41: pshufd $-40 @@ -134,7 +134,7 @@ ; SSE-LABEL: test8: ; SSE: pmuludq ; SSE: pshufd $49 -; SSE-NOT: pshufd $49 +; SSE: pshufd $49 ; SSE: pmuludq ; SSE: shufps $-35 ; SSE: pshufd $-40 @@ -147,7 +147,7 @@ ; AVX-LABEL: test8: ; AVX: vpmuldq ; AVX: vpshufd $49 -; AVX-NOT: vpshufd $49 +; AVX: vpshufd $49 ; AVX: vpmuldq ; AVX: vshufps $-35 ; AVX: vpshufd $-40 @@ -162,10 +162,12 @@ ret <8 x i32> %div ; AVX-LABEL: test9: -; AVX: vpalignr $4 ; AVX: vpbroadcastd +; AVX: vpalignr $4 +; AVX: vpalignr $4 ; AVX: vpmuldq ; AVX: vpmuldq +; AVX: vpalignr $4 ; AVX: vpblendd $170 ; AVX: vpadd ; AVX: vpsrld $31 @@ -195,10 +197,12 @@ ret <8 x i32> %rem ; AVX-LABEL: test11: -; AVX: vpalignr $4 ; AVX: vpbroadcastd +; AVX: vpalignr $4 +; AVX: vpalignr $4 ; AVX: vpmuldq ; AVX: vpmuldq +; AVX: vpalignr $4 ; AVX: vpblendd $170 ; AVX: vpadd ; AVX: vpsrld $31 Index: test/CodeGen/X86/widen_cast-1.ll =================================================================== --- test/CodeGen/X86/widen_cast-1.ll +++ test/CodeGen/X86/widen_cast-1.ll @@ 
-2,12 +2,12 @@ ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; CHECK: movl -; CHECK: paddd +; CHECK: paddw ; CHECK: movlpd ; Scheduler causes produce a different instruction order ; ATOM: movl -; ATOM: paddd +; ATOM: paddw ; ATOM: movlpd ; bitcast a v4i16 to v2i32 Index: test/CodeGen/X86/widen_conv-1.ll =================================================================== --- test/CodeGen/X86/widen_conv-1.ll +++ test/CodeGen/X86/widen_conv-1.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -; CHECK: paddq +; CHECK: paddd ; truncate v2i64 to v2i32 Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -92,10 +92,9 @@ %i16vec4 = type <4 x i16> define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind { ; CHECK-LABEL: add4i16: -; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]] -; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]] -; CHECK-NEXT: paddd %[[R0]], %[[R1]] -; CHECK-NEXT: pshufb {{.*}}, %[[R1]] +; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]] +; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]] +; CHECK-NEXT: paddw %[[R0]], %[[R1]] ; CHECK-NEXT: movq %[[R1]], (%{{.*}}) %a = load %i16vec4* %ap, align 16 %b = load %i16vec4* %bp, align 16 Index: test/CodeGen/X86/x86-64-tls-1.ll =================================================================== --- test/CodeGen/X86/x86-64-tls-1.ll +++ test/CodeGen/X86/x86-64-tls-1.ll @@ -1,10 +1,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s @tm_nest_level = internal thread_local global i32 0 define i64 @z() nounwind { -; FIXME: The codegen here is primitive at best and could be much better. -; The add and the moves can be folded together. 
-; CHECK-DAG: movq $tm_nest_level@TPOFF, %rcx -; CHECK-DAG: movq %fs:0, %rax -; CHECK: addl %ecx, %eax +; CHECK: movq $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x +; CHECK-NEXT: addl %fs:0, %e[[R0]]x +; CHECK-NEXT: andq $100, %r[[R0]]x + ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100) } Index: test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll =================================================================== --- test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll +++ test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll @@ -2,10 +2,10 @@ define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind { ; CHECK-LABEL: LCPI0_0 -; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 -; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 -; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 -; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 +; CHECK-NEXT: .long 1065353216 +; CHECK-NEXT: .long 1065353216 +; CHECK-NEXT: .long 1065353216 +; CHECK-NEXT: .long 1065353216 ; CHECK-LABEL: foo: ; CHECK: cmpeqps %xmm1, %xmm0 ; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0