Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1198,6 +1198,11 @@
   Level = AtLevel;
   LegalOperations = Level >= AfterLegalizeVectorOps;
   LegalTypes = Level >= AfterLegalizeTypes;
+  bool LegalizeOps = Level == AfterLegalizeDAG;
+  bool DoCombine = OptLevel != CodeGenOpt::None;
+
+  if (!DoCombine && !LegalizeOps)
+    return;
 
   // Add all the dag nodes to the worklist.
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
@@ -1233,7 +1238,7 @@
 
     // If this combine is running after legalizing the DAG, re-legalize any
     // nodes pulled off the worklist.
-    if (Level == AfterLegalizeDAG) {
+    if (LegalizeOps) {
       SmallSetVector<SDNode *, 16> UpdatedNodes;
       bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
@@ -1245,52 +1250,54 @@
       continue;
     }
 
-    DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
-
-    // Add any operands of the new node which have not yet been combined to the
-    // worklist as well. Because the worklist uniques things already, this
-    // won't repeatedly process the same operand.
-    CombinedNodes.insert(N);
-    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-      if (!CombinedNodes.count(N->getOperand(i).getNode()))
-        AddToWorklist(N->getOperand(i).getNode());
+    if (DoCombine) {
+      DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
 
-    SDValue RV = combine(N);
+      // Add any operands of the new node which have not yet been combined to the
+      // worklist as well. Because the worklist uniques things already, this
+      // won't repeatedly process the same operand.
+      CombinedNodes.insert(N);
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        if (!CombinedNodes.count(N->getOperand(i).getNode()))
+          AddToWorklist(N->getOperand(i).getNode());
 
-    if (!RV.getNode())
-      continue;
+      SDValue RV = combine(N);
 
-    ++NodesCombined;
+      if (!RV.getNode())
+        continue;
 
-    // If we get back the same node we passed in, rather than a new node or
-    // zero, we know that the node must have defined multiple values and
-    // CombineTo was used. Since CombineTo takes care of the worklist
-    // mechanics for us, we have no work to do in this case.
-    if (RV.getNode() == N)
-      continue;
+      ++NodesCombined;
 
-    assert(N->getOpcode() != ISD::DELETED_NODE &&
-           RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
-           "Node was deleted but visit returned new node!");
+      // If we get back the same node we passed in, rather than a new node or
+      // zero, we know that the node must have defined multiple values and
+      // CombineTo was used. Since CombineTo takes care of the worklist
+      // mechanics for us, we have no work to do in this case.
+      if (RV.getNode() == N)
+        continue;
 
-    DEBUG(dbgs() << " ... into: ";
-          RV.getNode()->dump(&DAG));
+      assert(N->getOpcode() != ISD::DELETED_NODE &&
+             RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
+             "Node was deleted but visit returned new node!");
+
+      DEBUG(dbgs() << " ... into: ";
+            RV.getNode()->dump(&DAG));
+
+      // Transfer debug value.
+      DAG.TransferDbgValues(SDValue(N, 0), RV);
+      if (N->getNumValues() == RV.getNode()->getNumValues())
+        DAG.ReplaceAllUsesWith(N, RV.getNode());
+      else {
+        assert(N->getValueType(0) == RV.getValueType() &&
+               N->getNumValues() == 1 && "Type mismatch");
+        SDValue OpV = RV;
+        DAG.ReplaceAllUsesWith(N, &OpV);
+      }
 
-    // Transfer debug value.
-    DAG.TransferDbgValues(SDValue(N, 0), RV);
-    if (N->getNumValues() == RV.getNode()->getNumValues())
-      DAG.ReplaceAllUsesWith(N, RV.getNode());
-    else {
-      assert(N->getValueType(0) == RV.getValueType() &&
-             N->getNumValues() == 1 && "Type mismatch");
-      SDValue OpV = RV;
-      DAG.ReplaceAllUsesWith(N, &OpV);
+      // Push the new node and any users onto the worklist
+      AddToWorklist(RV.getNode());
+      AddUsersToWorklist(RV.getNode());
     }
 
-    // Push the new node and any users onto the worklist
-    AddToWorklist(RV.getNode());
-    AddUsersToWorklist(RV.getNode());
-
     // Finally, if the node is now dead, remove it from the graph. The node
     // may not be dead if the replacement process recursively simplified to
    // something else needing this node. This will also take care of adding any
Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2565,14 +2565,9 @@
       ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
       SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-      if (Subtarget->isThumb()) {
-        SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
-        return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
-      } else {
-        SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
-                          Reg0 };
-        return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
-      }
+      unsigned Opc = (Subtarget->isThumb() ? ARM::t2ADDrs : ARM::ADDrsi);
+      SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
     }
     if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
       unsigned ShImm = Log2_32(RHSV+1);
@@ -2582,14 +2577,9 @@
       ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
       SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-      if (Subtarget->isThumb()) {
-        SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
-        return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
-      } else {
-        SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
-                          Reg0 };
-        return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
-      }
+      unsigned Opc = (Subtarget->isThumb() ? ARM::t2RSBrs : ARM::RSBrsi);
+      SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
     }
   }
   break;
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -984,6 +984,7 @@
     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFMA(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue
       LowerFormalArguments(SDValue Chain,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -694,7 +694,7 @@
     setOperationAction(ISD::FCOS, VT, Expand);
     setOperationAction(ISD::FSINCOS, VT, Expand);
     setOperationAction(ISD::FREM, VT, Expand);
-    setOperationAction(ISD::FMA, VT, Expand);
+    setOperationAction(ISD::FMA, VT, Expand);
     setOperationAction(ISD::FPOWI, VT, Expand);
     setOperationAction(ISD::FSQRT, VT, Expand);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
@@ -1114,12 +1114,12 @@
     setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
 
     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
-      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
-      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
-      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
-      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
-      setOperationAction(ISD::FMA, MVT::f32, Legal);
-      setOperationAction(ISD::FMA, MVT::f64, Legal);
+      setOperationAction(ISD::FMA, MVT::v8f32, Custom);
+      setOperationAction(ISD::FMA, MVT::v4f64, Custom);
+      setOperationAction(ISD::FMA, MVT::v4f32, Custom);
+      setOperationAction(ISD::FMA, MVT::v2f64, Custom);
+      setOperationAction(ISD::FMA, MVT::f32, Custom);
+      setOperationAction(ISD::FMA, MVT::f64, Custom);
     }
 
     if (Subtarget->hasInt256()) {
@@ -1285,8 +1285,8 @@
     setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
     setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
     setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
-    setOperationAction(ISD::FMA, MVT::v8f64, Legal);
-    setOperationAction(ISD::FMA, MVT::v16f32, Legal);
+    setOperationAction(ISD::FMA, MVT::v8f64, Custom);
+    setOperationAction(ISD::FMA, MVT::v16f32, Custom);
 
     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
@@ -1588,7 +1588,6 @@
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
-  setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::MLOAD);
@@ -12358,6 +12357,14 @@
     if (User->getOpcode() == ISD::FNEG)
       return Op;
 
+  // If this is an FNEG and it has an FMA user, bail out so the combination
+  // can be lowered into an X86ISD::FMSUB or X86ISD::FNMSUB.
+  bool IsFNEG = (Op.getOpcode() == ISD::FNEG);
+  if (IsFNEG)
+    for (SDNode *User : Op->uses())
+      if (User->getOpcode() == ISD::FMA || User->getOpcode() == ISD::FMAD)
+        return Op;
+
   SDValue Op0 = Op.getOperand(0);
   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
@@ -17606,6 +17613,45 @@
   return NOOP;
 }
 
+SDValue X86TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getNode()->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  EVT ScalarVT = VT.getScalarType();
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
+    return SDValue();
+
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+  SDValue C = Op.getOperand(2);
+
+  bool NegA = (A.getOpcode() == ISD::FNEG);
+  bool NegB = (B.getOpcode() == ISD::FNEG);
+  bool NegC = (C.getOpcode() == ISD::FNEG);
+
+  // The multiply is negated when exactly one of A and B is negated.
+  bool NegMul = (NegA != NegB);
+  if (NegA)
+    A = A.getOperand(0);
+  if (NegB)
+    B = B.getOperand(0);
+  if (NegC)
+    C = C.getOperand(0);
+
+  unsigned Opcode;
+  if (!NegMul)
+    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+  else
+    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+  return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -17700,6 +17746,7 @@
   case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG);
   case ISD::GC_TRANSITION_END:   return LowerGC_TRANSITION_END(Op, DAG);
+  case ISD::FMA:                 return LowerFMA(Op, DAG);
   }
 }
@@ -23828,46 +23875,6 @@
   return SDValue();
 }
 
-static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
-                                 const X86Subtarget* Subtarget) {
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-
-  // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  EVT ScalarVT = VT.getScalarType();
-  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
-      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
-    return SDValue();
-
-  SDValue A = N->getOperand(0);
-  SDValue B = N->getOperand(1);
-  SDValue C = N->getOperand(2);
-
-  bool NegA = (A.getOpcode() == ISD::FNEG);
-  bool NegB = (B.getOpcode() == ISD::FNEG);
-  bool NegC = (C.getOpcode() == ISD::FNEG);
-
-  // Negative multiplication when NegA xor NegB
-  bool NegMul = (NegA != NegB);
-  if (NegA)
-    A = A.getOperand(0);
-  if (NegB)
-    B = B.getOperand(0);
-  if (NegC)
-    C = C.getOperand(0);
-
-  unsigned Opcode;
-  if (!NegMul)
-    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
-  else
-    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
-
-  return DAG.getNode(Opcode, dl, VT, A, B, C);
-}
-
 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -24479,7 +24486,6 @@
   case X86ISD::VPERMILPI:
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
-  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
   case X86ISD::INSERTPS: {
Index: test/CodeGen/AArch64/aarch64_f16_be.ll
===================================================================
--- test/CodeGen/AArch64/aarch64_f16_be.ll
+++ test/CodeGen/AArch64/aarch64_f16_be.ll
@@ -32,7 +32,11 @@
 ; CHECK-NOT: st1
 
 ; CHECK-BE-LABEL: test_bitcast_v8f16_to_fp128:
-; CHECK-BE: st1
+; CHECK-BE: rev64
+; CHECK-BE: ext
+; CHECK-BE: rev64
+; CHECK-BE: ext
+; CHECK-BE: str
 
   %x = alloca fp128, align 16
   %y = bitcast <8 x half> %a to fp128
@@ -58,7 +62,9 @@
 ; CHECK-NOT: st1
 
 ; CHECK-BE-LABEL: test_bitcast_v4f16_to_v1f64:
-; CHECK-BE: st1
+; CHECK-BE: rev64
+; CHECK-BE: rev64
+; CHECK-BE: str
 
   %x = alloca <1 x double>, align 8
   %y = bitcast <4 x half> %a to <1 x double>
Index: test/CodeGen/AArch64/and-mask-removal.ll
===================================================================
--- test/CodeGen/AArch64/and-mask-removal.ll
+++ test/CodeGen/AArch64/and-mask-removal.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -fast-isel=false -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; RUN: llc -O1 -fast-isel=false -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 @board = common global [400 x i8] zeroinitializer, align 1
 @next_string = common global i32 0, align 4
Index: test/CodeGen/ARM/Windows/alloca.ll
===================================================================
--- test/CodeGen/ARM/Windows/alloca.ll
+++ test/CodeGen/ARM/Windows/alloca.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
+; RUN: llc -O1 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
 
 declare arm_aapcs_vfpcc i32 @num_entries()
Index: test/CodeGen/ARM/alloc-no-stack-realign.ll
===================================================================
--- test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=NO-REALIGN
-; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=REALIGN
+; RUN: llc < %s -mtriple=armv7-apple-ios -O1 | FileCheck %s -check-prefix=NO-REALIGN
+; RUN: llc < %s -mtriple=armv7-apple-ios -O1 | FileCheck %s -check-prefix=REALIGN
 
 ; rdar://12713765
 ; When realign-stack is set to false, make sure we are not creating stack
@@ -11,25 +11,27 @@
 ; NO-REALIGN-LABEL: test1
 ; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
 ; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
-; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+
+; NO-REALIGN: add r[[R9:[0-9]+]], r[[R1]], #32
+; NO-REALIGN: add r[[R1]], r[[R1]], #48
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R9]]:128]
 ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
 
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
+; NO-REALIGN: add r[[R1:[0-9]+]], r[[R3:[0-9]+]], #48
 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; NO-REALIGN: add r[[R1:[0-9]+]], r[[R3:[0-9]+]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]!
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]
 
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; NO-REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; NO-REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
+; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
 ; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
+
   %retval = alloca <16 x float>, align 16
   %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
   store <16 x float> %0, <16 x float>* %retval
@@ -44,19 +46,19 @@
 ; REALIGN: bfc sp, #0, #6
 ; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
 ; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
-; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
+; REALIGN: add r[[R9:[0-9]+]], r[[R1]], #32
+; REALIGN: add r[[R1:[0-9]+]], r[[R1]], #48
+; REALIGN: mov r[[R3:[0-9]+]], sp
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R9]]:128]
 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-
-; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; REALIGN: orr r[[R1:[0-9]+]], r[[R3:[0-9]+]], #48
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: orr r[[R1:[0-9]+]], r[[R3]], #32
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]
+; REALIGN: orr r[[R1:[0-9]+]], r[[R3]], #16
 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
 
 ; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
Index: test/CodeGen/ARM/big-endian-ret-f64.ll
===================================================================
--- test/CodeGen/ARM/big-endian-ret-f64.ll
+++ test/CodeGen/ARM/big-endian-ret-f64.ll
@@ -3,8 +3,8 @@
 
 define double @fn() {
 ; CHECK-LABEL: fn
-; CHECK: ldr r0, [sp]
-; CHECK: ldr r1, [sp, #4]
+; CHECK: vldr [[REG:d[0-9]+]], [sp]
+; CHECK: vmov r1, r0, [[REG]]
   %r = alloca double, align 8
   %1 = load double, double* %r, align 8
   ret double %1
Index: test/CodeGen/ARM/vst3.ll
===================================================================
--- test/CodeGen/ARM/vst3.ll
+++ test/CodeGen/ARM/vst3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm-eabi -mattr=+neon -fast-isel=0 -O0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+neon -fast-isel=0 -O1 %s -o - | FileCheck %s
 
 define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vst3i8:
Index: test/CodeGen/X86/atomic16.ll
===================================================================
--- test/CodeGen/X86/atomic16.ll
+++ test/CodeGen/X86/atomic16.ll
@@ -70,10 +70,10 @@
 ; X32: lock
 ; X32: andw $3
   %t2 = atomicrmw and i16* @sc16, i16 5 acquire
-; X64: andl
+; X64: andw
 ; X64: lock
 ; X64: cmpxchgw
-; X32: andl
+; X32: andw
 ; X32: lock
 ; X32: cmpxchgw
   %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire
@@ -95,10 +95,10 @@
 ; X32: lock
 ; X32: orw $3
   %t2 = atomicrmw or i16* @sc16, i16 5 acquire
-; X64: orl
+; X64: orw
 ; X64: lock
 ; X64: cmpxchgw
-; X32: orl
+; X32: orw
 ; X32: lock
 ; X32: cmpxchgw
   %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire
@@ -120,10 +120,10 @@
 ; X32: lock
 ; X32: xorw $3
   %t2 = atomicrmw xor i16* @sc16, i16 5 acquire
-; X64: xorl
+; X64: xorw
 ; X64: lock
 ; X64: cmpxchgw
-; X32: xorl
+; X32: xorw
 ; X32: lock
 ; X32: cmpxchgw
   %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire
@@ -140,12 +140,12 @@
 ; X64-LABEL: atomic_fetch_nand16
 ; X32-LABEL: atomic_fetch_nand16
   %t1 = atomicrmw nand i16* @sc16, i16 %x acquire
-; X64: andl
-; X64: notl
+; X64: andw
+; X64: notw
 ; X64: lock
 ; X64: cmpxchgw
-; X32: andl
-; X32: notl
+; X32: andw
+; X32: notw
 ; X32: lock
 ; X32: cmpxchgw
   ret void
Index: test/CodeGen/X86/atomic32.ll
===================================================================
--- test/CodeGen/X86/atomic32.ll
+++ test/CodeGen/X86/atomic32.ll
@@ -111,7 +111,9 @@
 ; WITH-CMOV: cmpxchgl
 
 ; NOCMOV: subl
-; NOCMOV: jge
+; NOCMOV: setg [[REG:%[a-z]+]]
+; NOCMOV: testb $1, [[REG]]
+; NOCMOV: jne
 ; NOCMOV: lock
 ; NOCMOV: cmpxchgl
   ret void
@@ -130,7 +132,9 @@
 ; WITH-CMOV: cmpxchgl
 
 ; NOCMOV: subl
-; NOCMOV: jle
+; NOCMOV: setle [[REG:%[a-z]+]]
+; NOCMOV: testb $1, [[REG]]
+; NOCMOV: jne
 ; NOCMOV: lock
 ; NOCMOV: cmpxchgl
   ret void
@@ -149,7 +153,9 @@
 ; WITH-CMOV: cmpxchgl
 
 ; NOCMOV: subl
-; NOCMOV: ja
+; NOCMOV: seta [[REG:%[a-z]+]]
+; NOCMOV: testb $1, [[REG]]
+; NOCMOV: jne
 ; NOCMOV: lock
 ; NOCMOV: cmpxchgl
   ret void
@@ -168,7 +174,9 @@
 ; WITH-CMOV: cmpxchgl
 
 ; NOCMOV: subl
-; NOCMOV: jb
+; NOCMOV: setbe [[REG:%[a-z]+]]
+; NOCMOV: testb $1, [[REG]]
+; NOCMOV: jne
 ; NOCMOV: lock
 ; NOCMOV: cmpxchgl
   ret void
Index: test/CodeGen/X86/atomic6432.ll
===================================================================
--- test/CodeGen/X86/atomic6432.ll
+++ test/CodeGen/X86/atomic6432.ll
@@ -32,18 +32,18 @@
 define void @atomic_fetch_sub64() nounwind {
 ; X32-LABEL: atomic_fetch_sub64:
   %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
-; X32: addl $-1
-; X32: adcl $-1
+; X32: subl $1
+; X32: sbbl $0
 ; X32: lock
 ; X32: cmpxchg8b
   %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
-; X32: addl $-3
-; X32: adcl $-1
+; X32: subl $3
+; X32: sbbl $0
 ; X32: lock
 ; X32: cmpxchg8b
   %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
-; X32: addl $-5
-; X32: adcl $-1
+; X32: subl $5
+; X32: sbbl $0
 ; X32: lock
 ; X32: cmpxchg8b
   %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
Index: test/CodeGen/X86/dag-optnone.ll
===================================================================
--- test/CodeGen/X86/dag-optnone.ll
+++ test/CodeGen/X86/dag-optnone.ll
@@ -1,33 +1,18 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -O0 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -O0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=OPT
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -O1 -mattr=+avx | FileCheck %s --check-prefix=OPT
 
 ; Background:
-; If fast-isel bails out to normal selection, then the DAG combiner will run,
-; even at -O0. In principle this should not happen (those are optimizations,
-; and we said -O0) but as a practical matter there are some instruction
-; selection patterns that depend on the legalizations and transforms that the
-; DAG combiner does.
+; If fast-isel bails out to normal selection, then the DAG combiner should not
+; run at -O0.
 ;
 ; The 'optnone' attribute implicitly sets -O0 and fast-isel for the function.
-; The DAG combiner was disabled for 'optnone' (but not -O0) by r221168, then
-; re-enabled in r233153 because of problems with instruction selection patterns
-; mentioned above. (Note: because 'optnone' is supposed to match -O0, r221168
-; really should have disabled the combiner for both.)
 ;
-; If instruction selection eventually becomes smart enough to run without DAG
-; combiner, then the combiner can be turned off for -O0 (not just 'optnone')
-; and this test can go away. (To be replaced by a different test that verifies
-; the DAG combiner does *not* run at -O0 or for 'optnone' functions.)
-;
-; In the meantime, this test wants to make sure the combiner stays enabled for
-; 'optnone' functions, just as it is for -O0.
-
-
-; The test cases @foo[WithOptnone] prove that the same DAG combine happens
-; with -O0 and with 'optnone' set. To prove this, we use a Windows triple to
+; The test cases @foo[WithOptnone] prove that no DAG combine happens with
+; -O0 and with 'optnone' set. To prove this, we use a Windows triple to
 ; cause fast-isel to bail out (because something about the calling convention
 ; is not handled in fast-isel). Then we have a repeated fadd that can be
-; combined into an fmul. We show that this happens in both the non-optnone
-; function and the optnone function.
+; combined into an fmul. We show that this combine does not happen in either
+; the non-optnone function or the optnone function.
 
 define float @foo(float %x) #0 {
 entry:
@@ -37,8 +22,9 @@
 }
 
 ; CHECK-LABEL: @foo
-; CHECK-NOT: add
-; CHECK: mul
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK: add
 ; CHECK-NEXT: ret
 
 define float @fooWithOptnone(float %x) #1 {
@@ -48,10 +34,11 @@
   ret float %add1
 }
 
-; CHECK-LABEL: @fooWithOptnone
-; CHECK-NOT: add
-; CHECK: mul
-; CHECK-NEXT: ret
+; OPT-LABEL: @fooWithOptnone
+; OPT-NOT: mul
+; OPT: add
+; OPT: add
+; OPT-NEXT: ret
 
 
 ; The test case @bar is derived from an instruction selection failure case
Index: test/CodeGen/X86/fast-isel-gep.ll
===================================================================
--- test/CodeGen/X86/fast-isel-gep.ll
+++ test/CodeGen/X86/fast-isel-gep.ll
@@ -67,7 +67,8 @@
   ret double %tmp2
 
 ; X32-LABEL: test4:
-; X32: 128(%e{{.*}},%e{{.*}},8)
+; X32: addl $16, [[REG:%e[a-z]+]]
+; X32: (%e{{.*}},[[REG]],8)
 
 ; X64-LABEL: test4:
 ; X64: 128(%r{{.*}},%r{{.*}},8)
 }
Index: test/CodeGen/X86/fma-no-dag-combine.ll
===================================================================
--- test/CodeGen/X86/fma-no-dag-combine.ll
+++ test/CodeGen/X86/fma-no-dag-combine.ll
@@ -0,0 +1,14 @@
+; Check that an fma intrinsic is properly lowered to a target-specific fma
+; instruction when the DAG combiner is disabled at -O0.
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -O0 -mattr=+fma | FileCheck %s
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+
+; CHECK: test_fma_no_combine
+; CHECK: vfmadd213ps
+; CHECK: ret
+define <4 x float> @test_fma_no_combine(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %res
+}
Index: test/CodeGen/X86/inline-asm-tied.ll
===================================================================
--- test/CodeGen/X86/inline-asm-tied.ll
+++ test/CodeGen/X86/inline-asm-tied.ll
@@ -1,8 +1,9 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
 ; rdar://6992609
 
-; CHECK: movl [[EDX:%e..]], 4(%esp)
-; CHECK: movl [[EDX]], 4(%esp)
+; CHECK: movl [[REG1:%e..]], 4(%esp)
+; CHECK: movl 4(%esp), [[REG2:%e..]]
+; CHECK: movl [[REG2]], 4(%esp)
 target triple = "i386-apple-darwin9.0"
 @llvm.used = appending global [1 x i8*] [i8* bitcast (i64 (i64)* @_OSSwapInt64 to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
Index: test/CodeGen/X86/musttail.ll
===================================================================
--- test/CodeGen/X86/musttail.ll
+++ test/CodeGen/X86/musttail.ll
@@ -45,7 +45,8 @@
 define i32 @t4({}* %fn, i32 %n, i32 %r) {
 ; CHECK-LABEL: t4:
 ; CHECK: incl %[[r:.*]]
-; CHECK: decl %[[n:.*]]
+; CHECK: {{decl|subl}}
+; CHECK-SAME: %[[n:.*]]
 ; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
 ; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
 ; CHECK: jmpl *%{{.*}}
@@ -69,7 +70,8 @@
 ; CHECK: movl %esp, %esi
 
 ; Modify the args.
 ; CHECK: incl %[[r:.*]]
-; CHECK: decl %[[n:.*]]
+; CHECK: {{decl|subl}}
+; CHECK-SAME: %[[n:.*]]
 
 ; Store them through ebp, since that's the only stable arg pointer.
 ; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
 ; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
Index: test/CodeGen/X86/switch.ll
===================================================================
--- test/CodeGen/X86/switch.ll
+++ test/CodeGen/X86/switch.ll
@@ -19,13 +19,21 @@
 ; Should be lowered as straight compares in -O0 mode.
 ; NOOPT-LABEL: basic
 ; NOOPT: subl $1, %eax
-; NOOPT: je
+; NOOPT: sete [[R1:%.+]]
+; NOOPT: testb $1, [[R1]]
+; NOOPT: jne
 ; NOOPT: subl $3, %eax
-; NOOPT: je
+; NOOPT: sete [[R1]]
+; NOOPT: testb $1, [[R1]]
+; NOOPT: jne
 ; NOOPT: subl $4, %eax
-; NOOPT: je
+; NOOPT: sete [[R1]]
+; NOOPT: testb $1, [[R1]]
+; NOOPT: jne
 ; NOOPT: subl $5, %eax
-; NOOPT: je
+; NOOPT: sete [[R1]]
+; NOOPT: testb $1, [[R1]]
+; NOOPT: jne
 
 ; Jump table otherwise.
 ; CHECK-LABEL: basic
@@ -62,11 +70,15 @@
 
 ; We do this even at -O0, because it's cheap and makes codegen faster.
 ; NOOPT-LABEL: simple_ranges
-; NOOPT: subl $4
-; NOOPT: jb
-; NOOPT: addl $-100
-; NOOPT: subl $4
-; NOOPT: jb
+; NOOPT: subl $3
+; NOOPT: setbe [[R1:%.+]]
+; NOOPT: testb $1, [[R1]]
+; NOOPT: jne
+; NOOPT: subl $100
+; NOOPT: subl $3
+; NOOPT: setbe [[R1]]
+; NOOPT: testb $1, [[R1]]
+; NOOPT: jne
 }
Index: test/CodeGen/X86/win32_sret.ll
===================================================================
--- test/CodeGen/X86/win32_sret.ll
+++ test/CodeGen/X86/win32_sret.ll
@@ -1,10 +1,10 @@
 ; We specify -mcpu explicitly to avoid instruction reordering that happens on
 ; some setups (e.g., Atom) from affecting the output.
-; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-NO-O0
 ; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
 ; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-cygwin | FileCheck %s -check-prefix=CYGWIN
 ; RUN: llc < %s -mcpu=core2 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
-; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-O0
 ; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
 ; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-cygwin | FileCheck %s -check-prefix=CYGWIN
 ; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
@@ -116,11 +116,11 @@
 ; LINUX-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
 
 ; The address of the return structure is passed as an implicit parameter.
-; In the -O0 build, %eax is spilled at the beginning of the function, hence we
-; should match both 4(%esp) and 8(%esp).
-; WIN32: {{[48]}}(%esp), %eax
-; WIN32: movl $42, (%eax)
-; WIN32: retl $4
+; WIN32-NO-O0: {{[48]}}(%esp), %eax
+; WIN32-O0: subl ${{[0-9]+}}, %esp
+; WIN32-O0: {{[0-9]+}}(%esp), %eax
+; WIN32: movl $42, (%eax)
+; WIN32: retl $4
 }
 
 define void @call_foo5() {
Index: test/CodeGen/X86/win64_eh.ll
===================================================================
--- test/CodeGen/X86/win64_eh.ll
+++ test/CodeGen/X86/win64_eh.ll
@@ -48,7 +48,7 @@
 
 ; Checks stack push
 
-define i32 @foo3(i32 %f_arg, i32 %e_arg, i32 %d_arg, i32 %c_arg, i32 %b_arg, i32 %a_arg) uwtable {
+define i32 @foo3(i32 %g_arg, i32 %f_arg, i32 %e_arg, i32 %d_arg, i32 %c_arg, i32 %b_arg, i32 %a_arg) uwtable {
 entry:
   %a = alloca i32
   %b = alloca i32
@@ -56,12 +56,14 @@
   %d = alloca i32
   %e = alloca i32
   %f = alloca i32
+  %g = alloca i32
   store i32 %a_arg, i32* %a
   store i32 %b_arg, i32* %b
   store i32 %c_arg, i32* %c
   store i32 %d_arg, i32* %d
   store i32 %e_arg, i32* %e
   store i32 %f_arg, i32* %f
+  store i32 %g_arg, i32* %g
   %tmp = load i32, i32* %a
   %tmp1 = mul i32 %tmp, 2
   %tmp2 = load i32, i32* %b
@@ -71,7 +73,7 @@
   %tmp6 = mul i32 %tmp5, 5
   %tmp7 = add i32 %tmp4, %tmp6
   %tmp8 = load i32, i32* %d
-  %tmp9 = mul i32 %tmp8, 7
+  %tmp9 = mul i32 %tmp8, 9
   %tmp10 = add i32 %tmp7, %tmp9
   %tmp11 = load i32, i32* %e
   %tmp12 = mul i32 %tmp11, 11
@@ -85,11 +87,11 @@
 ; WIN64: .seh_proc foo3
 ; WIN64: pushq %rsi
 ; WIN64: .seh_pushreg 6
-; NORM: subq $24, %rsp
-; ATOM: leaq -24(%rsp), %rsp
-; WIN64: .seh_stackalloc 24
+; NORM: subq $32, %rsp
+; ATOM: leaq -32(%rsp), %rsp
+; WIN64: .seh_stackalloc 32
 ; WIN64: .seh_endprologue
-; WIN64: addq $24, %rsp
+; WIN64: addq $32, %rsp
 ; WIN64: popq %rsi
 ; WIN64: ret
 ; WIN64: .seh_endproc
Index: test/CodeGen/XCore/threads.ll
===================================================================
--- test/CodeGen/XCore/threads.ll
+++ test/CodeGen/XCore/threads.ll
@@ -123,7 +123,9 @@
 define void @phiNode2( i1 %bool) {
 ; N.B. check an extra 'Node_crit_edge' (LBB12_1) is inserted
 ; PHINODE-LABEL: phiNode2:
-; PHINODE: bf {{r[0-9]}}, .LBB12_3
+; PHINODE: mkmsk [[MASK_REG:r[0-9]+]], 1
+; PHINODE: xor [[REG:r[0-9]+]], [[REG]], [[MASK_REG]]
+; PHINODE: bt [[REG]], .LBB12_3
 ; PHINODE: bu .LBB12_1
 ; PHINODE-LABEL: .LBB12_1:
 ; PHINODE: get r11, id
Index: test/DebugInfo/ARM/line.test
===================================================================
--- test/DebugInfo/ARM/line.test
+++ test/DebugInfo/ARM/line.test
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm-none-linux -O0 -filetype=asm < %S/../Inputs/line.ll | FileCheck %S/../Inputs/line.ll
+; RUN: llc -mtriple=arm-none-linux -O1 -filetype=asm < %S/../Inputs/line.ll | FileCheck %S/../Inputs/line.ll
 
 ; This is more complex than it looked. It's mixed up somewhere in SelectionDAG
 ; (legalized as br_cc, losing the separation between the comparison and the
Index: test/DebugInfo/X86/op_deref.ll
===================================================================
--- test/DebugInfo/X86/op_deref.ll
+++ test/DebugInfo/X86/op_deref.ll
@@ -20,8 +20,8 @@
 ; right now, so we check the asm output:
 ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o - -filetype=asm | FileCheck %s -check-prefix=ASM-CHECK
 ; vla should have a register-indirect address at one point.
-; ASM-CHECK: DEBUG_VALUE: vla <- RCX
-; ASM-CHECK: DW_OP_breg2
+; ASM-CHECK: DEBUG_VALUE: vla <- RDX
+; ASM-CHECK: DW_OP_breg1
 
 ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s --check-prefix=PRETTY-PRINT
 ; PRETTY-PRINT: DIExpression(DW_OP_deref, DW_OP_deref)
Index: test/DebugInfo/X86/vla.ll
===================================================================
--- test/DebugInfo/X86/vla.ll
+++ test/DebugInfo/X86/vla.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=asm %s -o - | FileCheck %s
 ; Ensure that we generate an indirect location for the variable length array a.
-; CHECK: ##DEBUG_VALUE: vla:a <- RDX
-; CHECK: DW_OP_breg1
+; CHECK: ##DEBUG_VALUE: vla:a <- RSI
+; CHECK: DW_OP_breg4
 ; rdar://problem/13658587
 ;
 ; generated from:
Index: test/tools/llvm-symbolizer/ppc64.test
===================================================================
--- test/tools/llvm-symbolizer/ppc64.test
+++ test/tools/llvm-symbolizer/ppc64.test
@@ -4,7 +4,7 @@
 int bar() { return foo(); }
 int _start() { return bar(); }
 
-RUN: %python -c "print('0x1000014c\n0x1000018c\n0x100001cc')" | llvm-symbolizer -obj=%p/Inputs/ppc64 | FileCheck %s
+RUN: "%python" -c "print('0x1000014c\n0x1000018c\n0x100001cc')" | llvm-symbolizer -obj=%p/Inputs/ppc64 | FileCheck %s
 CHECK: foo
 CHECK: bar