Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2985,17 +2985,39 @@
   ///
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  // Returns true if the target has a pattern
+  // to extract N bits starting at the M'th bit.
+  // On x86 that could be BMI1's BEXTR, on AArch64 it is UBFX.
+  virtual bool haveBitFieldExtractPattern(EVT VT) const { return false; }
+
+  // True if N is (X >> C1) & C2 where C2 is a mask with all-ones in low bits.
+  virtual bool isBitFieldExtractPattern(const SDNode *N) const {
+    if (!haveBitFieldExtractPattern(N->getValueType(0)))
+      return false;
+    if (N->getOpcode() != ISD::AND)
+      return false;
+    const auto *AndCstMask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!AndCstMask)
+      return false;
+    SDValue N0 = N->getOperand(0);
+    if (N0.getOpcode() != ISD::SRL || !isa<ConstantSDNode>(N0.getOperand(1)))
+      return false;
+    // Ok, if the mask is all-ones in low bits, then it is a 'bit field extract'.
+    return AndCstMask->getAPIntValue().isMask();
+  }
+
   /// Return true if it is profitable to move this shift by a constant amount
   /// though its operand, adjusting any immediate operands as necessary to
   /// preserve semantics. This transformation may not be desirable if it
-  /// disrupts a particularly auspicious target-specific tree (e.g. bitfield
-  /// extraction in AArch64). By default, it returns true.
+  /// disrupts a particularly auspicious target-specific tree (e.g. bitfield
+  /// extraction in x86/AArch64).
   ///
   /// @param N the shift node
   /// @param Level the current DAGCombine legalization level.
   virtual bool isDesirableToCommuteWithShift(const SDNode *N,
                                              CombineLevel Level) const {
-    return true;
+    // Desirable if it will not disturb the 'bit field extract' pattern.
+    return !isBitFieldExtractPattern(N->getOperand(0).getNode());
   }
 
   // Return true if it is profitable to combine a BUILD_VECTOR with a stride-pattern
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -413,6 +413,7 @@
                       const SDLoc &DL);
     SDValue unfoldMaskedMerge(SDNode *N);
     SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
+    SDValue formBitFieldExtractFromShiftedAnd(SDNode *N);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                           const SDLoc &DL, bool foldBooleans);
     SDValue rebuildSetCC(SDValue N);
@@ -1039,11 +1040,17 @@
 /// Check the specified integer node value to see if it can be simplified or if
 /// things it uses can be simplified by bit propagation. If so, return true.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+  if (TLI.isBitFieldExtractPattern(Op.getNode()))
+    return false;
+
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
   KnownBits Known;
   if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO))
     return false;
+  if (TLI.isBitFieldExtractPattern(TLO.Old.getNode()))
+    return false;
+
   // Revisit the node.
   AddToWorklist(Op.getNode());
@@ -4411,6 +4418,66 @@
   return T1;
 }
+SDValue DAGCombiner::formBitFieldExtractFromShiftedAnd(SDNode *N) {
+  assert(N->getOpcode() == ISD::AND);
+
+  EVT VT = N->getValueType(0);
+
+  // If the target does not support the 'bit field extract' pattern, nothing to do.
+  if (!TLI.haveBitFieldExtractPattern(VT))
+    return SDValue();
+
+  // Look for (X l>> C1) & (C2 << C3) (where C2 is all-ones in low bits)
+  // and transform into ((X l>> C4) & C2) << C3, where C4 = C1 + C3.
+  // But only if C4 < bitwidth(X), i.e. we do not produce an undefined shift.
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  const auto *AndConstMask = dyn_cast<ConstantSDNode>(N1);
+  if (!AndConstMask)
+    return SDValue();
+
+  if (N0.getOpcode() != ISD::SRL || !N0.hasOneUse())
+    return SDValue();
+
+  const auto *ShiftConstAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!ShiftConstAmt)
+    return SDValue();
+
+  SDValue X = N0.getOperand(0);
+  const APInt &C1 = ShiftConstAmt->getAPIntValue();
+  const APInt &Mask = AndConstMask->getAPIntValue();
+
+  // The mask should be shifted, i.e. 0b1100, but not 0b11 or 0b101.
+  if (!Mask.isShiftedMask() || Mask.isMask())
+    return SDValue();
+
+  uint64_t C3 = Mask.countTrailingZeros();
+  assert(C3 > 0 && "we were looking for a *shifted* mask");
+
+  const APInt C4 = C1 + C3;
+  if (C4.uge(VT.getScalarSizeInBits()))
+    return SDValue();
+
+  const APInt C2 = Mask.lshr(C3);
+  assert(C2.isMask() && C2.countPopulation() == Mask.countPopulation());
+
+  SDLoc DL(N);
+
+  SDValue C4v = DAG.getConstant(C4.getZExtValue(), DL, MVT::i8);
+  SDValue C2v = DAG.getConstant(C2, DL, VT);
+  SDValue C3v = DAG.getConstant(C3, DL, MVT::i8);
+
+  SDValue NewX = DAG.getNode(ISD::SRL, DL, VT, X, C4v);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewX, C2v);
+  assert(TLI.isBitFieldExtractPattern(NewAnd.getNode()) &&
+         "we should have formed a 'bit field extract' pattern. "
+         "If TLI hook can't recognize it, we will have endless loops.");
+
+  return DAG.getNode(ISD::SHL, DL, VT, NewAnd, C3v);
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4711,6 +4778,9 @@
   if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
     return Shifts;
+  if (SDValue V = formBitFieldExtractFromShiftedAnd(N))
+    return V;
+
   return SDValue();
 }
@@ -6399,7 +6469,8 @@
     return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
   }
-  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+  if (N1C && !TLI.isBitFieldExtractPattern(N0.getNode()) &&
+      SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
   // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -363,9 +363,10 @@
   const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
-  /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
-  bool isDesirableToCommuteWithShift(const SDNode *N,
-                                     CombineLevel Level) const override;
+  bool haveBitFieldExtractPattern(EVT VT) const override {
+    // UBFX is valid for i32 and i64 types.
+    return VT == MVT::i32 || VT == MVT::i64;
+  }
   /// Returns true if it is beneficial to convert a load of a constant
   /// to just the constant itself.
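
As a sanity check on the new combine (not part of the patch itself): the rewrite performed by formBitFieldExtractFromShiftedAnd is the integer identity (X l>> C1) & Mask == ((X l>> (C1 + C3)) & (Mask l>> C3)) << C3, valid whenever Mask is a shifted mask with C3 trailing zeros and C1 + C3 stays below the bit width. A minimal standalone C++ sketch of that identity, using the constants from the non_bextr64 case in test/CodeGen/X86/bmi-x86_64.ll further down (shift by 2, mask 0x1FFFFFFFE):

// Standalone sketch, not part of the patch; checks the identity the combine relies on.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const uint64_t C1 = 2;                      // original lshr amount
  const uint64_t Mask = 0x1FFFFFFFEULL;       // shifted mask: 32 ones, 1 trailing zero
  const uint64_t C3 = __builtin_ctzll(Mask);  // trailing zeros of the mask -> 1
  const uint64_t C2 = Mask >> C3;             // low-bits mask -> 0xFFFFFFFF
  const uint64_t C4 = C1 + C3;                // folded shift amount -> 3, < 64 as required
  for (uint64_t X : {0x0123456789ABCDEFULL, ~0ULL, 0x8000000000000001ULL}) {
    const uint64_t Original = (X >> C1) & Mask;         // srl + and, as matched
    const uint64_t Rewritten = ((X >> C4) & C2) << C3;  // 'bit field extract' + shl
    assert(Original == Rewritten);
  }
  return 0;
}

With these values the rewrite is exactly the shape the updated non_bextr64 check expects: a bextrq with control 0x2003 (32 bits starting at bit 3) followed by addq %rax, %rax for the << 1.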
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8525,24 +8525,6 @@
   return ScratchRegs;
 }
-bool
-AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
-                                                     CombineLevel Level) const {
-  N = N->getOperand(0).getNode();
-  EVT VT = N->getValueType(0);
-  // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
-  // it with shift to let it be lowered to UBFX.
-  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
-      isa<ConstantSDNode>(N->getOperand(1))) {
-    uint64_t TruncMask = N->getConstantOperandVal(1);
-    if (isMask_64(TruncMask) &&
-        N->getOperand(0).getOpcode() == ISD::SRL &&
-        isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
-      return false;
-  }
-  return true;
-}
-
 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                               Type *Ty) const {
   assert(Ty->isIntegerTy());
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1034,6 +1034,8 @@
              (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
     }
+    bool haveBitFieldExtractPattern(EVT VT) const override;
+
     /// Returns true if it is beneficial to convert a load of a constant
     /// to just the constant itself.
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4701,6 +4701,11 @@
   return true;
 }
+bool X86TargetLowering::haveBitFieldExtractPattern(EVT VT) const {
+  // BEXTR is valid for i32 and i64 types.
+  return Subtarget.hasBMI() && (VT == MVT::i32 || VT == MVT::i64);
+}
+
 /// Returns true if it is beneficial to convert a load of a constant
 /// to just the constant itself.
 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Index: test/CodeGen/AArch64/expand-select.ll
===================================================================
--- test/CodeGen/AArch64/expand-select.ll
+++ test/CodeGen/AArch64/expand-select.ll
@@ -39,8 +39,8 @@
 ; CHECK-NEXT:    ldr x9, [sp]
 ; CHECK-NEXT:    dup v1.4s, v0.s[0]
 ; CHECK-NEXT:    mov x10, v1.d[1]
-; CHECK-NEXT:    lsr x10, x10, #32
-; CHECK-NEXT:    tst w10, #0x1
+; CHECK-NEXT:    ubfx x10, x10, #32, #1
+; CHECK-NEXT:    cmp w10, #0 // =0
 ; CHECK-NEXT:    fmov w10, s0
 ; CHECK-NEXT:    csel x11, x5, x11, ne
 ; CHECK-NEXT:    csel x9, x4, x9, ne
Index: test/CodeGen/AArch64/selectcc-to-shiftand.ll
===================================================================
--- test/CodeGen/AArch64/selectcc-to-shiftand.ll
+++ test/CodeGen/AArch64/selectcc-to-shiftand.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
 ; Compare if negative and select of constants where one constant is zero.
@@ -8,7 +9,6 @@ ; CHECK-NEXT: mov w8, #5 ; CHECK-NEXT: and w0, w8, w0, asr #31 ; CHECK-NEXT: ret -; %tmp.1 = icmp slt i32 %a, 0 %retval = select i1 %tmp.1, i32 5, i32 0 ret i32 %retval @@ -19,10 +19,9 @@ define i32 @neg_sel_special_constant(i32 %a) { ; CHECK-LABEL: neg_sel_special_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #22 -; CHECK-NEXT: and w0, w8, #0x200 +; CHECK-NEXT: lsr w8, w0, #31 +; CHECK-NEXT: lsl w0, w8, #9 ; CHECK-NEXT: ret -; %tmp.1 = icmp slt i32 %a, 0 %retval = select i1 %tmp.1, i32 512, i32 0 ret i32 %retval @@ -35,7 +34,6 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w1, w0, asr #31 ; CHECK-NEXT: ret -; %tmp.1 = icmp slt i32 %a, 0 %retval = select i1 %tmp.1, i32 %b, i32 0 ret i32 %retval @@ -48,7 +46,6 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w0, w0, w0, asr #31 ; CHECK-NEXT: ret -; %tmp = icmp slt i32 %a, 1 %min = select i1 %tmp, i32 %a, i32 0 ret i32 %min @@ -64,7 +61,6 @@ ; CHECK-NEXT: mov w8, #5 ; CHECK-NEXT: bic w0, w8, w0, asr #31 ; CHECK-NEXT: ret -; %tmp.1 = icmp sgt i32 %a, -1 %retval = select i1 %tmp.1, i32 5, i32 0 ret i32 %retval @@ -78,7 +74,6 @@ ; CHECK-NEXT: orr w8, wzr, #0x200 ; CHECK-NEXT: bic w0, w8, w0, lsr #22 ; CHECK-NEXT: ret -; %tmp.1 = icmp sgt i32 %a, -1 %retval = select i1 %tmp.1, i32 512, i32 0 ret i32 %retval @@ -91,7 +86,6 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: bic w0, w1, w0, asr #31 ; CHECK-NEXT: ret -; %tmp.1 = icmp sgt i32 %a, -1 %retval = select i1 %tmp.1, i32 %b, i32 0 ret i32 %retval @@ -104,7 +98,6 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: bic w0, w0, w0, asr #31 ; CHECK-NEXT: ret -; %tmp = icmp sgt i32 %a, 0 %min = select i1 %tmp, i32 %a, i32 0 ret i32 %min @@ -119,7 +112,6 @@ ; CHECK-NEXT: sub w8, w0, w1 ; CHECK-NEXT: bic w0, w8, w8, asr #31 ; CHECK-NEXT: ret -; %sub = sub nsw i32 %x, %y %cmp = icmp sgt i32 %sub, 0 %sel = select i1 %cmp, i32 %sub, i32 0 Index: test/CodeGen/AArch64/signbit-shift.ll =================================================================== --- test/CodeGen/AArch64/signbit-shift.ll +++ test/CodeGen/AArch64/signbit-shift.ll @@ -150,8 +150,9 @@ define i32 @add_sext_ifneg(i32 %x) { ; CHECK-LABEL: add_sext_ifneg: ; CHECK: // %bb.0: -; CHECK-NEXT: asr w8, w0, #31 -; CHECK-NEXT: add w0, w8, #42 // =42 +; CHECK-NEXT: lsr w8, w0, #31 +; CHECK-NEXT: mov w9, #42 +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %c = icmp slt i32 %x, 0 %e = sext i1 %c to i32 Index: test/CodeGen/X86/bmi-x86_64.ll =================================================================== --- test/CodeGen/X86/bmi-x86_64.ll +++ test/CodeGen/X86/bmi-x86_64.ll @@ -75,9 +75,9 @@ define i64 @non_bextr64(i64 %x) { ; CHECK-LABEL: non_bextr64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrq $2, %rdi -; CHECK-NEXT: movabsq $8589934590, %rax # imm = 0x1FFFFFFFE -; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: movl $8195, %eax # imm = 0x2003 +; CHECK-NEXT: bextrq %rax, %rdi, %rax +; CHECK-NEXT: addq %rax, %rax ; CHECK-NEXT: retq entry: %shr = lshr i64 %x, 2 Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -1,13 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s 
-mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefix=AVX define double @t1(float* nocapture %x) nounwind readonly ssp { -entry: ; SSE-LABEL: t1: -; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0 -; SSE: cvtss2sd %xmm0, %xmm0 +; SSE: # %bb.0: # %entry +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: t1: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: %0 = load float, float* %x, align 4 %1 = fpext float %0 to double @@ -15,47 +24,84 @@ } define float @t2(double* nocapture %x) nounwind readonly ssp optsize { -entry: ; SSE-LABEL: t2: -; SSE: cvtsd2ss ([[A0]]), %xmm0 +; SSE: # %bb.0: # %entry +; SSE-NEXT: cvtsd2ss (%rcx), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: t2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vcvtsd2ss (%rcx), %xmm0, %xmm0 +; AVX-NEXT: retq +entry: %0 = load double, double* %x, align 8 %1 = fptrunc double %0 to float ret float %1 } define float @squirtf(float* %x) nounwind { -entry: ; SSE-LABEL: squirtf: -; SSE: movss ([[A0]]), %xmm0 -; SSE: sqrtss %xmm0, %xmm0 +; SSE: # %bb.0: # %entry +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: squirtf: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: %z = load float, float* %x %t = call float @llvm.sqrt.f32(float %z) ret float %t } define double @squirt(double* %x) nounwind { -entry: ; SSE-LABEL: squirt: -; SSE: movsd ([[A0]]), %xmm0 -; SSE: sqrtsd %xmm0, %xmm0 +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: squirt: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: %z = load double, double* %x %t = call double @llvm.sqrt.f64(double %z) ret double %t } define float @squirtf_size(float* %x) nounwind optsize { -entry: ; SSE-LABEL: squirtf_size: -; SSE: sqrtss ([[A0]]), %xmm0 +; SSE: # %bb.0: # %entry +; SSE-NEXT: sqrtss (%rcx), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: squirtf_size: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vsqrtss (%rcx), %xmm0, %xmm0 +; AVX-NEXT: retq +entry: %z = load float, float* %x %t = call float @llvm.sqrt.f32(float %z) ret float %t } define double @squirt_size(double* %x) nounwind optsize { -entry: ; SSE-LABEL: squirt_size: -; SSE: sqrtsd ([[A0]]), %xmm0 +; SSE: # %bb.0: # %entry +; SSE-NEXT: sqrtsd (%rcx), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: squirt_size: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vsqrtsd (%rcx), %xmm0, %xmm0 +; AVX-NEXT: retq +entry: %z = load double, double* %x %t = call double @llvm.sqrt.f64(double %z) ret double %t @@ -81,6 +127,61 @@ ; SSE: cvtsi2ssl %{{.*}}, [[XMM2]] ; define float @loopdep1(i32 %m) nounwind uwtable readnone ssp { +; SSE-LABEL: loopdep1: +; SSE: # %bb.0: # %entry +; SSE-NEXT: testl %ecx, %ecx +; SSE-NEXT: je .LBB6_1 +; SSE-NEXT: # %bb.2: # %for.body.preheader +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB6_3: # %for.body +; SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: cvtsi2ssl %eax, %xmm2 +; SSE-NEXT: xorps %xmm3, %xmm3 +; SSE-NEXT: cvtsi2ssl %ecx, %xmm3 +; SSE-NEXT: addss %xmm2, %xmm0 +; 
SSE-NEXT: addss %xmm3, %xmm1 +; SSE-NEXT: incl %eax +; SSE-NEXT: decl %ecx +; SSE-NEXT: jne .LBB6_3 +; SSE-NEXT: # %bb.4: # %for.end +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: .LBB6_1: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: loopdep1: +; AVX: # %bb.0: # %entry +; AVX-NEXT: testl %ecx, %ecx +; AVX-NEXT: je .LBB6_1 +; AVX-NEXT: # %bb.2: # %for.body.preheader +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB6_3: # %for.body +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssl %ecx, %xmm3, %xmm2 +; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: addl $1, %eax +; AVX-NEXT: addl $-1, %ecx +; AVX-NEXT: jne .LBB6_3 +; AVX-NEXT: # %bb.4: # %for.end +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; AVX-NEXT: .LBB6_1: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq entry: %tobool3 = icmp eq i32 %m, 0 br i1 %tobool3, label %for.end, label %for.body @@ -120,6 +221,95 @@ ; SSE: xorps %[[REG:xmm.]], %[[REG]] ; SSE: cvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]] define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind { +; SSE-LABEL: loopdep2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movq (%rcx), %rax +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB7_1: # %loop +; SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sdq %r8, %xmm0 +; SSE-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero +; SSE-NEXT: addsd (%rdx), %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: addq %rcx, %rax +; SSE-NEXT: incq %r8 +; SSE-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; SSE-NEXT: jne .LBB7_1 +; SSE-NEXT: # %bb.2: # %ret +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 
16-byte Reload +; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: retq +; +; AVX-LABEL: loopdep2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $184, %rsp +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: movq (%rcx), %rax +; AVX-NEXT: movl $1, %r8d +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB7_1: # %loop +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcvtsi2sdq %r8, %xmm1, %xmm0 +; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero +; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: addq $1, %r8 +; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX-NEXT: jne .LBB7_1 +; AVX-NEXT: # %bb.2: # %ret +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $184, %rsp +; AVX-NEXT: retq entry: %vx = load i64, i64* %x br label %loop @@ -151,6 +341,161 @@ @v = common global [1024 x i32] zeroinitializer, align 16 define void @loopdep3() { +; SSE-LABEL: loopdep3: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rsi +; SSE-NEXT: .seh_pushreg 6 +; SSE-NEXT: subq $160, %rsp +; SSE-NEXT: .seh_stackalloc 160 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 15, 144 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 14, 128 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 13, 112 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 12, 96 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 11, 80 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 10, 64 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 9, 48 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 8, 32 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 7, 16 +; SSE-NEXT: movaps %xmm6, (%rsp) # 
16-byte Spill +; SSE-NEXT: .seh_savexmm 6, 0 +; SSE-NEXT: .seh_endprologue +; SSE-NEXT: xorl %r9d, %r9d +; SSE-NEXT: leaq {{.*}}(%rip), %r8 +; SSE-NEXT: leaq {{.*}}(%rip), %r10 +; SSE-NEXT: leaq {{.*}}(%rip), %r11 +; SSE-NEXT: leaq {{.*}}(%rip), %rax +; SSE-NEXT: leaq {{.*}}(%rip), %rdx +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB8_1: # %for.cond1.preheader +; SSE-NEXT: # =>This Loop Header: Depth=1 +; SSE-NEXT: # Child Loop BB8_2 Depth 2 +; SSE-NEXT: movq %r8, %rcx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB8_2: # %for.body3 +; SSE-NEXT: # Parent Loop BB8_1 Depth=1 +; SSE-NEXT: # => This Inner Loop Header: Depth=2 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sdl (%rcx), %xmm0 +; SSE-NEXT: mulsd (%rsi,%r10), %xmm0 +; SSE-NEXT: mulsd (%rsi,%r11), %xmm0 +; SSE-NEXT: mulsd (%rsi,%rax), %xmm0 +; SSE-NEXT: movsd %xmm0, (%rsi,%rdx) +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: addq $8, %rsi +; SSE-NEXT: addq $4, %rcx +; SSE-NEXT: cmpq $8192, %rsi # imm = 0x2000 +; SSE-NEXT: jne .LBB8_2 +; SSE-NEXT: # %bb.3: # %for.inc14 +; SSE-NEXT: # in Loop: Header=BB8_1 Depth=1 +; SSE-NEXT: incl %r9d +; SSE-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; SSE-NEXT: jne .LBB8_1 +; SSE-NEXT: # %bb.4: # %for.end16 +; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: addq $160, %rsp +; SSE-NEXT: popq %rsi +; SSE-NEXT: retq +; SSE-NEXT: .seh_handlerdata +; SSE-NEXT: .text +; SSE-NEXT: .seh_endproc +; +; AVX-LABEL: loopdep3: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rsi +; AVX-NEXT: .seh_pushreg 6 +; AVX-NEXT: subq $160, %rsp +; AVX-NEXT: .seh_stackalloc 160 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 15, 144 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 14, 128 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 13, 112 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 12, 96 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 11, 80 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 10, 64 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 9, 48 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 8, 32 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 7, 16 +; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 6, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: xorl %r9d, %r9d +; AVX-NEXT: leaq {{.*}}(%rip), %r8 +; AVX-NEXT: leaq {{.*}}(%rip), %r10 +; AVX-NEXT: leaq {{.*}}(%rip), %r11 +; AVX-NEXT: leaq {{.*}}(%rip), %rax +; AVX-NEXT: leaq {{.*}}(%rip), %rdx +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB8_1: # 
%for.cond1.preheader +; AVX-NEXT: # =>This Loop Header: Depth=1 +; AVX-NEXT: # Child Loop BB8_2 Depth 2 +; AVX-NEXT: movq %r8, %rcx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB8_2: # %for.body3 +; AVX-NEXT: # Parent Loop BB8_1 Depth=1 +; AVX-NEXT: # => This Inner Loop Header: Depth=2 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx) +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: addq $8, %rsi +; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000 +; AVX-NEXT: jne .LBB8_2 +; AVX-NEXT: # %bb.3: # %for.inc14 +; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1 +; AVX-NEXT: addl $1, %r9d +; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; AVX-NEXT: jne .LBB8_1 +; AVX-NEXT: # %bb.4: # %for.end16 +; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $160, %rsp +; AVX-NEXT: popq %rsi +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc entry: br label %for.cond1.preheader @@ -187,23 +532,126 @@ for.end16: ; preds = %for.inc14 ret void -;SSE-LABEL:@loopdep3 -;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]] -;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] -;SSE-NEXT: movsd [[XMM0]], -;AVX-LABEL:@loopdep3 -;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]] -;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}} -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] -;AVX-NEXT: vmovsd [[XMM0]], } define double @inlineasmdep(i64 %arg) { +; SSE-LABEL: inlineasmdep: +; SSE: # %bb.0: # %top +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: .seh_stackalloc 168 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 15, 144 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 14, 128 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 13, 112 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 12, 96 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 11, 80 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 10, 64 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 9, 48 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 8, 32 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 7, 16 +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: 
.seh_savexmm 6, 0 +; SSE-NEXT: .seh_endprologue +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sdq %rcx, %xmm0 +; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: retq +; SSE-NEXT: .seh_handlerdata +; SSE-NEXT: .text +; SSE-NEXT: .seh_endproc +; +; AVX-LABEL: inlineasmdep: +; AVX: # %bb.0: # %top +; AVX-NEXT: subq $168, %rsp +; AVX-NEXT: .seh_stackalloc 168 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 15, 144 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 14, 128 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 13, 112 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 12, 96 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 11, 80 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 10, 64 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 9, 48 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 8, 32 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 7, 16 +; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 6, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0 +; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $168, %rsp +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: 
.text +; AVX-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() @@ -215,14 +663,136 @@ tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@inlineasmdep -;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]] -;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers and ; hiding the false dependency behind a true dependency define double @truedeps(float %arg) { +; SSE-LABEL: truedeps: +; SSE: # %bb.0: # %top +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: .seh_stackalloc 184 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 15, 160 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 14, 144 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 13, 128 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 12, 112 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 11, 96 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 10, 80 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 9, 64 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 8, 48 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 7, 32 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 6, 16 +; SSE-NEXT: .seh_endprologue +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: retq +; SSE-NEXT: .seh_handlerdata +; SSE-NEXT: .text +; SSE-NEXT: .seh_endproc +; +; AVX-LABEL: truedeps: +; AVX: # %bb.0: # %top +; AVX-NEXT: subq $184, %rsp +; AVX-NEXT: .seh_stackalloc 184 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 15, 160 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: 
.seh_savexmm 14, 144 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 13, 128 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 12, 112 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 11, 96 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 10, 80 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 9, 64 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 8, 48 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 7, 32 +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 6, 16 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $184, %rsp +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() @@ -235,14 +805,132 @@ tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = fpext float %arg to double ret double %tmp1 -;AVX-LABEL:@truedeps -;AVX-NOT: vxorps -;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers and ; choosing the register with the highest clearence define double @clearence(i64 %arg) { +; SSE-LABEL: clearence: +; SSE: # %bb.0: # %top +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: .seh_stackalloc 168 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 15, 144 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 14, 128 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 13, 112 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 12, 96 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 11, 80 +; SSE-NEXT: movaps %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 10, 64 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 9, 48 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 8, 32 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 7, 16 +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 6, 0 +; SSE-NEXT: .seh_endprologue +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sdq %rcx, %xmm0 +; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: retq +; SSE-NEXT: .seh_handlerdata +; SSE-NEXT: .text +; SSE-NEXT: .seh_endproc +; +; AVX-LABEL: clearence: +; AVX: # %bb.0: # %top +; AVX-NEXT: subq $168, %rsp +; AVX-NEXT: .seh_stackalloc 168 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 15, 144 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 14, 128 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 13, 112 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 12, 96 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 11, 80 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 10, 64 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 9, 48 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 8, 32 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 7, 16 +; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 6, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX-NEXT: vcvtsi2sdq %rcx, %xmm6, %xmm0 +; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; 
AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $168, %rsp +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() @@ -255,9 +943,6 @@ tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@clearence -;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] -;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers in order to @@ -265,6 +950,104 @@ ; iteration, especially when we cannot zero out the undef register because it ; is alive. define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { +; SSE-LABEL: loopclearence: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $136, %rsp +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movq (%rcx), %rax +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB12_1: # %loop +; SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-NEXT: xorps %xmm4, %xmm4 +; SSE-NEXT: cvtsi2sdq %r8, %xmm4 +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: addsd (%rdx), %xmm4 +; SSE-NEXT: cvttsd2si %xmm4, %rcx +; SSE-NEXT: addq %rcx, %rax +; SSE-NEXT: incq %r8 +; SSE-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; SSE-NEXT: jne .LBB12_1 +; SSE-NEXT: # %bb.2: # %ret +; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: addq $136, %rsp +; SSE-NEXT: retq +; +; AVX-LABEL: loopclearence: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $136, %rsp +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 
%xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX-NEXT: movq (%rcx), %rax +; AVX-NEXT: movl $1, %r8d +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB12_1: # %loop +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vcvtsi2sdq %r8, %xmm5, %xmm4 +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0 +; AVX-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: addq $1, %r8 +; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX-NEXT: jne .LBB12_1 +; AVX-NEXT: # %bb.2: # %ret +; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $136, %rsp +; AVX-NEXT: retq entry: %vx = load i64, i64* %x br label %loop @@ -288,11 +1071,7 @@ br i1 %exitcond, label %ret, label %loop ret: ret i64 %s2 -;AVX-LABEL:@loopclearence ;Registers 4-7 are not used and therefore one of them should be chosen -;AVX-NOT: {{%xmm[4-7]}} -;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} -;AVX-NOT: [[XMM4_7]] } ; Make sure we are making a smart choice regarding undef registers even for more @@ -300,6 +1079,175 @@ ; julia> a = falses(10000); a[1:4:end] = true ; julia> linspace(1.0,2.0,10000)[a] define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { +; SSE-LABEL: loopclearance2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $152, %rsp +; SSE-NEXT: .seh_stackalloc 152 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 15, 128 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 14, 112 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 13, 96 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 12, 80 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 11, 64 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 10, 48 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 9, 32 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 8, 16 +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: .seh_savexmm 7, 0 +; SSE-NEXT: .seh_endprologue +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: 
#NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: #APP +; SSE-NEXT: #NO_APP +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB13_1: # %inner_loop +; SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: shrq $6, %rax +; SSE-NEXT: movq (%rdx,%rax,8), %rax +; SSE-NEXT: btq %r10, %rax +; SSE-NEXT: leaq 1(%r10), %r10 +; SSE-NEXT: jae .LBB13_1 +; SSE-NEXT: # %bb.2: # %loop_end +; SSE-NEXT: # in Loop: Header=BB13_1 Depth=1 +; SSE-NEXT: leaq 1(%r9), %r11 +; SSE-NEXT: xorps %xmm4, %xmm4 +; SSE-NEXT: cvtsi2sdq %r11, %xmm4 +; SSE-NEXT: movapd %xmm2, %xmm5 +; SSE-NEXT: subsd %xmm4, %xmm5 +; SSE-NEXT: mulsd %xmm3, %xmm5 +; SSE-NEXT: leaq -1(%r10), %rax +; SSE-NEXT: xorps %xmm4, %xmm4 +; SSE-NEXT: cvtsi2sdq %rax, %xmm4 +; SSE-NEXT: mulsd %xmm1, %xmm4 +; SSE-NEXT: addsd %xmm5, %xmm4 +; SSE-NEXT: divsd %xmm0, %xmm4 +; SSE-NEXT: movsd %xmm4, -8(%rcx,%r9,8) +; SSE-NEXT: movq %r11, %r9 +; SSE-NEXT: cmpq %r11, %r8 +; SSE-NEXT: jge .LBB13_1 +; SSE-NEXT: # %bb.3: # %loopdone +; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: retq +; SSE-NEXT: .seh_handlerdata +; SSE-NEXT: .text +; SSE-NEXT: .seh_endproc +; +; AVX-LABEL: loopclearance2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rsi +; AVX-NEXT: .seh_pushreg 6 +; AVX-NEXT: subq $144, %rsp +; AVX-NEXT: .seh_stackalloc 144 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 15, 128 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 14, 112 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 13, 96 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 12, 80 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 11, 64 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 10, 48 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 9, 32 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 8, 16 +; AVX-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm 7, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: movl $1, %r9d +; AVX-NEXT: xorl %r11d, %r11d +; AVX-NEXT: movl $14854, %r10d # imm = 0x3A06 +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB13_1: # 
%inner_loop +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: bextrq %r10, %r11, %rax +; AVX-NEXT: movq (%rdx,%rax,8), %rax +; AVX-NEXT: btq %r11, %rax +; AVX-NEXT: leaq 1(%r11), %r11 +; AVX-NEXT: jae .LBB13_1 +; AVX-NEXT: # %bb.2: # %loop_end +; AVX-NEXT: # in Loop: Header=BB13_1 Depth=1 +; AVX-NEXT: leaq 1(%r9), %rax +; AVX-NEXT: vcvtsi2sdq %rax, %xmm6, %xmm4 +; AVX-NEXT: vsubsd %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vmulsd %xmm3, %xmm4, %xmm4 +; AVX-NEXT: leaq -1(%r11), %rsi +; AVX-NEXT: vcvtsi2sdq %rsi, %xmm6, %xmm5 +; AVX-NEXT: vmulsd %xmm1, %xmm5, %xmm5 +; AVX-NEXT: vaddsd %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vdivsd %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: cmpq %rax, %r8 +; AVX-NEXT: jge .LBB13_1 +; AVX-NEXT: # %bb.3: # %loopdone +; AVX-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $144, %rsp +; AVX-NEXT: popq %rsi +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc entry: tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() @@ -334,14 +1282,9 @@ ; Register use, plus us clobbering 7-15 above, basically forces xmm6 here as ; the only reasonable choice. The primary thing we care about is that it's ; not one of the registers used in the loop (e.g. 
not the output reg here) -;AVX-NOT: %xmm6 -;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} -;AVX-NOT: %xmm6 %nexti_f = sitofp i64 %nexti to double %sub = fsub double %c1, %nexti_f %mul = fmul double %sub, %c2 -;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} -;AVX-NOT: %xmm6 %phi_f = sitofp i64 %phi to double %mul2 = fmul double %phi_f, %c3 %add2 = fadd double %mul, %mul2 Index: test/CodeGen/X86/extract-bits.ll =================================================================== --- test/CodeGen/X86/extract-bits.ll +++ test/CodeGen/X86/extract-bits.ll @@ -5614,23 +5614,69 @@ ; ---------------------------------------------------------------------------- ; define void @pr38938(i32* %a0, i64* %a1) { -; X86-LABEL: pr38938: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: shrl $19, %ecx -; X86-NEXT: andl $4092, %ecx # imm = 0xFFC -; X86-NEXT: incl (%eax,%ecx) -; X86-NEXT: retl +; X86-NOBMI-LABEL: pr38938: +; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl (%ecx), %ecx +; X86-NOBMI-NEXT: shrl $19, %ecx +; X86-NOBMI-NEXT: andl $4092, %ecx # imm = 0xFFC +; X86-NOBMI-NEXT: incl (%eax,%ecx) +; X86-NOBMI-NEXT: retl ; -; X64-LABEL: pr38938: -; X64: # %bb.0: -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: shrq $19, %rax -; X64-NEXT: andl $4092, %eax # imm = 0xFFC -; X64-NEXT: incl (%rdi,%rax) -; X64-NEXT: retq +; X86-BMI1NOTBM-LABEL: pr38938: +; X86-BMI1NOTBM: # %bb.0: +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movl $2581, %edx # imm = 0xA15 +; X86-BMI1NOTBM-NEXT: bextrl %edx, (%ecx), %ecx +; X86-BMI1NOTBM-NEXT: incl (%eax,%ecx,4) +; X86-BMI1NOTBM-NEXT: retl +; +; X86-BMI1TBM-LABEL: pr38938: +; X86-BMI1TBM: # %bb.0: +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1TBM-NEXT: bextrl $2581, (%ecx), %ecx # imm = 0xA15 +; X86-BMI1TBM-NEXT: incl (%eax,%ecx,4) +; X86-BMI1TBM-NEXT: retl +; +; X86-BMI1NOTBMBMI2-LABEL: pr38938: +; X86-BMI1NOTBMBMI2: # %bb.0: +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBMBMI2-NEXT: movl $2581, %edx # imm = 0xA15 +; X86-BMI1NOTBMBMI2-NEXT: bextrl %edx, (%ecx), %ecx +; X86-BMI1NOTBMBMI2-NEXT: incl (%eax,%ecx,4) +; X86-BMI1NOTBMBMI2-NEXT: retl +; +; X64-NOBMI-LABEL: pr38938: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: movq (%rsi), %rax +; X64-NOBMI-NEXT: shrq $19, %rax +; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC +; X64-NOBMI-NEXT: incl (%rdi,%rax) +; X64-NOBMI-NEXT: retq +; +; X64-BMI1NOTBM-LABEL: pr38938: +; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: movl $2581, %eax # imm = 0xA15 +; X64-BMI1NOTBM-NEXT: bextrl %eax, (%rsi), %eax +; X64-BMI1NOTBM-NEXT: incl (%rdi,%rax,4) +; X64-BMI1NOTBM-NEXT: retq +; +; X64-BMI1TBM-LABEL: pr38938: +; X64-BMI1TBM: # %bb.0: +; X64-BMI1TBM-NEXT: bextrl $2581, (%rsi), %eax # imm = 0xA15 +; X64-BMI1TBM-NEXT: incl (%rdi,%rax,4) +; X64-BMI1TBM-NEXT: retq +; +; X64-BMI1NOTBMBMI2-LABEL: pr38938: +; X64-BMI1NOTBMBMI2: # %bb.0: +; X64-BMI1NOTBMBMI2-NEXT: movl $2581, %eax # imm = 0xA15 +; X64-BMI1NOTBMBMI2-NEXT: bextrl %eax, (%rsi), %eax +; X64-BMI1NOTBMBMI2-NEXT: incl (%rdi,%rax,4) +; X64-BMI1NOTBMBMI2-NEXT: retq %tmp = load i64, i64* %a1, align 8 %tmp1 = lshr i64 %tmp, 21 %tmp2 = and i64 %tmp1, 1023 Index: test/CodeGen/X86/selectcc-to-shiftand.ll 
=================================================================== --- test/CodeGen/X86/selectcc-to-shiftand.ll +++ test/CodeGen/X86/selectcc-to-shiftand.ll @@ -35,9 +35,9 @@ ; ; CHECK-BMI-LABEL: neg_sel_special_constant: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edi, %eax -; CHECK-BMI-NEXT: shrl $22, %eax -; CHECK-BMI-NEXT: andl $512, %eax # imm = 0x200 +; CHECK-BMI-NEXT: movl $287, %eax # imm = 0x11F +; CHECK-BMI-NEXT: bextrl %eax, %edi, %eax +; CHECK-BMI-NEXT: shll $9, %eax ; CHECK-BMI-NEXT: retq %tmp.1 = icmp slt i32 %a, 0 %retval = select i1 %tmp.1, i32 512, i32 0 @@ -124,10 +124,10 @@ ; ; CHECK-BMI-LABEL: pos_sel_special_constant: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edi, %eax -; CHECK-BMI-NEXT: notl %eax -; CHECK-BMI-NEXT: shrl $22, %eax -; CHECK-BMI-NEXT: andl $512, %eax # imm = 0x200 +; CHECK-BMI-NEXT: notl %edi +; CHECK-BMI-NEXT: movl $287, %eax # imm = 0x11F +; CHECK-BMI-NEXT: bextrl %eax, %edi, %eax +; CHECK-BMI-NEXT: shll $9, %eax ; CHECK-BMI-NEXT: retq %tmp.1 = icmp sgt i32 %a, -1 %retval = select i1 %tmp.1, i32 512, i32 0
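
For readers decoding the new BEXTR immediates in the checks above: the BMI1 BEXTR control word is (length << 8) | start, so 0xA15 selects 10 bits starting at bit 21 (pr38938), 0x2003 selects 32 bits starting at bit 3 (non_bextr64), and 0x11F selects a single bit at position 31 (neg_sel_special_constant / pos_sel_special_constant). A small illustrative helper (not part of the patch; the function name is made up) that re-derives those constants:

#include <cassert>
#include <cstdint>

// BMI1 BEXTR control word: bits [7:0] = start position, bits [15:8] = length.
constexpr uint32_t bextrControl(uint32_t Start, uint32_t Length) {
  return (Length << 8) | Start;
}

int main() {
  assert(bextrControl(21, 10) == 0xA15);  // pr38938: (x >> 21) & 1023
  assert(bextrControl(3, 32) == 0x2003);  // non_bextr64: (x >> 3) & 0xFFFFFFFF
  assert(bextrControl(31, 1) == 0x11F);   // neg_sel_special_constant: (x >> 31) & 0x1
  return 0;
}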