Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -87,25 +88,21 @@ bool LegalTypes; bool ForCodeSize; - // Worklist of all of the nodes that need to be simplified. - // - // This has the semantics that when adding to the worklist, - // the item added must be next to be processed. It should - // also only appear once. The naive approach to this takes - // linear time. - // - // To reduce the insert/remove time to logarithmic, we use - // a set and a vector to maintain our worklist. - // - // The set contains the items on the worklist, but does not - // maintain the order they should be visited. - // - // The vector maintains the order nodes should be visited, but may - // contain duplicate or removed nodes. When choosing a node to - // visit, we pop off the order stack until we find an item that is - // also in the contents set. All operations are O(log N). - SmallPtrSet WorklistContents; - SmallVector WorklistOrder; + /// \brief Worklist of all of the nodes that need to be simplified. + /// + /// This must behave as a stack -- new nodes to process are pushed onto the + /// back and when processing we pop off of the back. + /// + /// The worklist will not contain duplicates but may contain null entries + /// due to nodes being deleted from the underlying DAG. + SmallVector Worklist; + + /// \brief Mapping from an SDNode to its position on the worklist. + /// + /// This is used to find and remove nodes from the worklist (by nulling + /// them) when they are deleted from the underlying DAG. 
It relies on + /// stable indices of nodes within the worklist. + DenseMap<SDNode *, unsigned> WorklistMap; // AA - Used for DAG load/store alias analysis. AliasAnalysis &AA; @@ -132,16 +129,24 @@ if (N->getOpcode() == ISD::HANDLENODE) return; - WorklistContents.insert(N); - WorklistOrder.push_back(N); + if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) + Worklist.push_back(N); } /// removeFromWorklist - remove all instances of N from the worklist. /// void removeFromWorklist(SDNode *N) { - WorklistContents.erase(N); + auto It = WorklistMap.find(N); + if (It == WorklistMap.end()) + return; // Not in the worklist. + + // Null out the entry rather than erasing it to avoid a linear operation. + Worklist[It->second] = nullptr; + WorklistMap.erase(It); } + bool recursivelyDeleteUnusedNodes(SDNode *N); + SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); @@ -1072,6 +1077,35 @@ return false; } +/// \brief Recursively delete a node which has no uses and any operands for +/// which it is the only use. +/// +/// Note that this both deletes the nodes and removes them from the worklist. +/// It also adds any nodes that have had a user deleted to the worklist as they +/// may now have only one use and be subject to other combines. +bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { + if (!N->use_empty()) + return false; + + SmallSetVector<SDNode *, 16> Nodes; + Nodes.insert(N); + do { + N = Nodes.pop_back_val(); + if (!N) + continue; + + if (N->use_empty()) { + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + Nodes.insert(N->getOperand(i).getNode()); + + removeFromWorklist(N); + DAG.DeleteNode(N); + } else { + AddToWorklist(N); + } + } while (!Nodes.empty()); + return true; +} //===----------------------------------------------------------------------===// // Main DAG Combiner implementation @@ -1099,27 +1133,25 @@ // while the worklist isn't empty, find a node and // try and combine it. 
- while (!WorklistContents.empty()) { + while (!WorklistMap.empty()) { SDNode *N; - // The WorklistOrder holds the SDNodes in order, but it may contain - // duplicates. - // In order to avoid a linear scan, we use a set (O(log N)) to hold what the - // worklist *should* contain, and check the node we want to visit is should - // actually be visited. + // The Worklist holds the SDNodes in order, but it may contain null entries. do { - N = WorklistOrder.pop_back_val(); - } while (!WorklistContents.erase(N)); + N = Worklist.pop_back_val(); + } while (!N); + + bool GoodWorklistEntry = WorklistMap.erase(N); + (void)GoodWorklistEntry; + assert(GoodWorklistEntry && + "Found a worklist entry without a corresponding map entry!"); // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms. - if (N->use_empty()) { - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - AddToWorklist(N->getOperand(i).getNode()); - - DAG.DeleteNode(N); + if (recursivelyDeleteUnusedNodes(N)) continue; - } + + WorklistRemover DeadNodes(*this); SDValue RV = combine(N); @@ -1147,7 +1179,6 @@ // Transfer debug value. DAG.TransferDbgValues(SDValue(N, 0), RV); - WorklistRemover DeadNodes(*this); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { @@ -1161,23 +1192,11 @@ AddToWorklist(RV.getNode()); AddUsersToWorklist(RV.getNode()); - // Add any uses of the old node to the worklist in case this node is the - // last one that uses them. They may become dead after this node is - // deleted. - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - AddToWorklist(N->getOperand(i).getNode()); - // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to - // something else needing this node. 
- if (N->use_empty()) { - // Nodes can be reintroduced into the worklist. Make sure we do not - // process a node that has been replaced. - removeFromWorklist(N); - - // Finally, since the node is now dead, remove it from the graph. - DAG.DeleteNode(N); - } + // something else needing this node. This will also take care of adding any + // operands which have lost a user to the worklist. + recursivelyDeleteUnusedNodes(N); } // If the root changed (e.g. it was a dead load, update the root). Index: llvm/trunk/test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/fold-stack-adjust.ll +++ llvm/trunk/test/CodeGen/ARM/fold-stack-adjust.ll @@ -167,9 +167,9 @@ define void @test_varsize(...) minsize { ; CHECK-T1-LABEL: test_varsize: ; CHECK-T1: sub sp, #16 -; CHECK-T1: push {r2, r3, r4, r5, r7, lr} +; CHECK-T1: push {r5, r6, r7, lr} ; ... -; CHECK-T1: pop {r2, r3, r4, r5, r7} +; CHECK-T1: pop {r2, r3, r7} ; CHECK-T1: pop {r3} ; CHECK-T1: add sp, #16 ; CHECK-T1: bx r3 Index: llvm/trunk/test/CodeGen/ARM/sxt_rot.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/sxt_rot.ll +++ llvm/trunk/test/CodeGen/ARM/sxt_rot.ll @@ -9,7 +9,8 @@ define signext i8 @test1(i32 %A) { ; CHECK: test1 -; CHECK: sxtb r0, r0, ror #8 +; CHECK: lsr r0, r0, #8 +; CHECK: sxtb r0, r0 %B = lshr i32 %A, 8 %C = shl i32 %A, 24 %D = or i32 %B, %C Index: llvm/trunk/test/CodeGen/PowerPC/complex-return.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/complex-return.ll +++ llvm/trunk/test/CodeGen/PowerPC/complex-return.ll @@ -24,10 +24,10 @@ } ; CHECK-LABEL: foo: -; CHECK: lfd 3 -; CHECK: lfd 4 ; CHECK: lfd 1 ; CHECK: lfd 2 +; CHECK: lfd 3 +; CHECK: lfd 4 define { float, float } @oof() nounwind { entry: Index: llvm/trunk/test/CodeGen/PowerPC/subsumes-pred-regs.ll 
=================================================================== --- llvm/trunk/test/CodeGen/PowerPC/subsumes-pred-regs.ll +++ llvm/trunk/test/CodeGen/PowerPC/subsumes-pred-regs.ll @@ -35,7 +35,7 @@ br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49 ; CHECK: .LBB0_7: -; CHECK: beq 1, .LBB0_10 +; CHECK: bne 1, .LBB0_10 ; CHECK: beq 0, .LBB0_10 ; CHECK: .LBB0_9: Index: llvm/trunk/test/CodeGen/R600/r600-export-fix.ll =================================================================== --- llvm/trunk/test/CodeGen/R600/r600-export-fix.ll +++ llvm/trunk/test/CodeGen/R600/r600-export-fix.ll @@ -3,9 +3,9 @@ ;CHECK: EXPORT T{{[0-9]}}.XYZW ;CHECK: EXPORT T{{[0-9]}}.0000 ;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0XZW +;CHECK: EXPORT T{{[0-9]}}.0XYZ ;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.YX00 +;CHECK: EXPORT T{{[0-9]}}.YZ00 ;CHECK: EXPORT T{{[0-9]}}.0000 ;CHECK: EXPORT T{{[0-9]}}.0000 Index: llvm/trunk/test/CodeGen/R600/swizzle-export.ll =================================================================== --- llvm/trunk/test/CodeGen/R600/swizzle-export.ll +++ llvm/trunk/test/CodeGen/R600/swizzle-export.ll @@ -94,7 +94,7 @@ ; EG-CHECK: @main2 ; EG-CHECK: T{{[0-9]+}}.XY__ -; EG-CHECK: T{{[0-9]+}}.YXZ0 +; EG-CHECK: T{{[0-9]+}}.ZXY0 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { main_body: Index: llvm/trunk/test/CodeGen/Thumb2/thumb2-sxt_rot.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/thumb2-sxt_rot.ll +++ llvm/trunk/test/CodeGen/Thumb2/thumb2-sxt_rot.ll @@ -10,7 +10,8 @@ define signext i8 @test1(i32 %A) { ; CHECK: test1 -; CHECK: sxtb.w r0, r0, ror #8 +; CHECK: lsrs r0, r0, #8 +; CHECK: sxtb r0, r0 %B = lshr i32 %A, 8 %C = shl i32 %A, 24 %D = or i32 %B, %C Index: llvm/trunk/test/CodeGen/Thumb2/thumb2-uxt_rot.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/thumb2-uxt_rot.ll +++ 
llvm/trunk/test/CodeGen/Thumb2/thumb2-uxt_rot.ll @@ -25,7 +25,7 @@ define zeroext i32 @test3(i32 %A.u) { ; A8: test3 -; A8: uxth.w r0, r0, ror #8 +; A8: ubfx r0, r0, #8, #16 %B.u = lshr i32 %A.u, 8 %C.u = shl i32 %A.u, 24 %D.u = or i32 %B.u, %C.u Index: llvm/trunk/test/CodeGen/X86/avx512-zext-load-crash.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-zext-load-crash.ll +++ llvm/trunk/test/CodeGen/X86/avx512-zext-load-crash.ll @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - -define <8 x i16> @test_zext_load() { - ; CHECK: vmovq -entry: - %0 = load <2 x i16> ** undef, align 8 - %1 = getelementptr inbounds <2 x i16>* %0, i64 1 - %2 = load <2 x i16>* %0, align 1 - %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> - %4 = load <2 x i16>* %1, align 1 - %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> - %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> - ret <8 x i16> %6 -} Index: llvm/trunk/test/CodeGen/X86/block-placement.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/block-placement.ll +++ llvm/trunk/test/CodeGen/X86/block-placement.ll @@ -237,44 +237,6 @@ ret i32 %base } -define void @test_loop_rotate_reversed_blocks() { -; This test case (greatly reduced from an Olden bencmark) ensures that the loop -; rotate implementation doesn't assume that loops are laid out in a particular -; order. The first loop will get split into two basic blocks, with the loop -; header coming after the loop latch. -; -; CHECK: test_loop_rotate_reversed_blocks -; CHECK: %entry -; Look for a jump into the middle of the loop, and no branches mid-way. 
-; CHECK: jmp -; CHECK: %loop1 -; CHECK-NOT: j{{\w*}} .LBB{{.*}} -; CHECK: %loop1 -; CHECK: je - -entry: - %cond1 = load volatile i1* undef - br i1 %cond1, label %loop2.preheader, label %loop1 - -loop1: - call i32 @f() - %cond2 = load volatile i1* undef - br i1 %cond2, label %loop2.preheader, label %loop1 - -loop2.preheader: - call i32 @f() - %cond3 = load volatile i1* undef - br i1 %cond3, label %exit, label %loop2 - -loop2: - call i32 @f() - %cond4 = load volatile i1* undef - br i1 %cond4, label %exit, label %loop2 - -exit: - ret void -} - define i32 @test_loop_align(i32 %i, i32* %a) { ; Check that we provide basic loop body alignment with the block placement ; pass. Index: llvm/trunk/test/CodeGen/X86/divide-by-constant.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/divide-by-constant.ll +++ llvm/trunk/test/CodeGen/X86/divide-by-constant.ll @@ -31,6 +31,7 @@ ; CHECK-LABEL: test3: ; CHECK: movzbl 8(%esp), %eax ; CHECK-NEXT: imull $171, %eax +; CHECK-NEXT: andl $65024, %eax ; CHECK-NEXT: shrl $9, %eax ; CHECK-NEXT: ret } @@ -56,9 +57,10 @@ %div = sdiv i16 %x, 10 ret i16 %div ; CHECK-LABEL: test6: -; CHECK: imull $26215, %eax, %ecx -; CHECK: sarl $18, %ecx -; CHECK: shrl $15, %eax +; CHECK: imull $26215, %eax +; CHECK: movl %eax, %ecx +; CHECK: shrl $31, %ecx +; CHECK: sarl $18, %eax } define i32 @test7(i32 %x) nounwind { Index: llvm/trunk/test/CodeGen/X86/fold-pcmpeqd-0.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/fold-pcmpeqd-0.ll +++ llvm/trunk/test/CodeGen/X86/fold-pcmpeqd-0.ll @@ -1,117 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s -; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s - -; i386 test has been disabled when scheduler 2-addr hack is disabled. 
- -; This testcase shouldn't need to spill the -1 value, -; so it should just use pcmpeqd to materialize an all-ones vector. -; For i386, cp load of -1 are folded. - -; With -regalloc=greedy, the live range is split before spilling, so the first -; pcmpeq doesn't get folded as a constant pool load. - -; I386-NOT: pcmpeqd -; I386: orps LCPI0_2, %xmm -; I386-NOT: pcmpeqd -; I386: orps LCPI0_2, %xmm - -; X86-64: pcmpeqd -; X86-64-NOT: pcmpeqd - - %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }> - %struct._cl_image_format_t = type <{ i32, i32, i32 }> - %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }> - -define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind { -entry: - %tmp3.i = load i32* null ; [#uses=1] - %cmp = icmp sgt i32 %tmp3.i, 200 ; [#uses=1] - br i1 %cmp, label %forcond, label %ifthen - -ifthen: ; preds = %entry - ret void - -forcond: ; preds = %entry - %tmp3.i536 = load i32* null ; [#uses=1] - %cmp12 = icmp slt i32 0, %tmp3.i536 ; [#uses=1] - br i1 %cmp12, label %forbody, label %afterfor - -forbody: ; preds = %forcond - %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1] - %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1] - %mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1] - %mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1] - %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0] - %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1] - %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> 
zeroinitializer) nounwind ; <<4 x i32>> [#uses=1] - %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1] - %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2] - %mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1] - %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1] - %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1] - %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] - %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1] - %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1] - %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1] - %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1] - %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1] - %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1] - %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1] - %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> 
; <<4 x i32>> [#uses=1] - %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0] - %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1] - %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] - %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1] - %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1] - %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0] - %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1] - %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1] - %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1] - %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2] - %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1] - %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1] - %orps.i18 = or <4 x i32> %andnps.i17, 
zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1] - %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2] - %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1] - %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1] - %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1] - %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1] - %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1] - %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1] - %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1] - call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind - unreachable - -afterfor: ; preds = %forcond - ret void -} - -declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone - -declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone - -declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone - -declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/narrow-shl-load.ll 
=================================================================== --- llvm/trunk/test/CodeGen/X86/narrow-shl-load.ll +++ llvm/trunk/test/CodeGen/X86/narrow-shl-load.ll @@ -30,40 +30,6 @@ ret void } - -; DAGCombiner shouldn't fold the sdiv (ashr) away. -; rdar://8636812 -; CHECK-LABEL: test2: -; CHECK: sarl - -define i32 @test2() nounwind { -entry: - %i = alloca i32, align 4 - %j = alloca i8, align 1 - store i32 127, i32* %i, align 4 - store i8 0, i8* %j, align 1 - %tmp3 = load i32* %i, align 4 - %mul = mul nsw i32 %tmp3, 2 - %conv4 = trunc i32 %mul to i8 - %conv5 = sext i8 %conv4 to i32 - %div6 = sdiv i32 %conv5, 2 - %conv7 = trunc i32 %div6 to i8 - %conv9 = sext i8 %conv7 to i32 - %cmp = icmp eq i32 %conv9, -1 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - ret i32 0 - -if.end: ; preds = %entry - call void @abort() noreturn - unreachable -} - -declare void @abort() noreturn - -declare void @exit(i32) noreturn - ; DAG Combiner can't fold this into a load of the 1'th byte. 
; PR8757 define i32 @test3(i32 *%P) nounwind ssp { Index: llvm/trunk/test/CodeGen/X86/store-narrow.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/store-narrow.ll +++ llvm/trunk/test/CodeGen/X86/store-narrow.ll @@ -34,8 +34,8 @@ ; X64: movb %sil, 1(%rdi) ; X32-LABEL: test2: -; X32: movb 8(%esp), %[[REG:[abcd]l]] -; X32: movb %[[REG]], 1(%{{.*}}) +; X32: movzbl 8(%esp), %e[[REG:[abcd]]]x +; X32: movb %[[REG]]l, 1(%{{.*}}) } define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp { Index: llvm/trunk/test/CodeGen/X86/vec_extract-sse4.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_extract-sse4.ll +++ llvm/trunk/test/CodeGen/X86/vec_extract-sse4.ll @@ -4,8 +4,8 @@ ; CHECK-LABEL: @t1 ; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] ; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]] -; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]] -; CHECK-NEXT: movl %[[R2]], (%[[R0]]) +; CHECK-NEXT: movss 12(%[[R1]]), %[[R2:xmm.*]] +; CHECK-NEXT: movss %[[R2]], (%[[R0]]) ; CHECK-NEXT: retl %X = load <4 x float>* %P1