Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -278,6 +278,7 @@
     /// target-specific DAG combines.
     SDValue combine(SDNode *N);
 
+    SDValue combine_AND_ShiftAND(SDNode *N, SDValue &N0, SDValue &N1);
     // Visitation implementation - Implement dag node combining for different
     // node types.  The semantics are as follows:
     // Return Value:
@@ -453,6 +454,9 @@
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
+    SDNode *ShrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *storeOp,
+                                                   const SDNode *orOp);
+
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -4005,6 +4009,85 @@
   return false;
 }
 
+// Fold pairs of expressions such as:
+//   x1 = (and x, 0x00FF)
+//   x2 = (and (shl x, 8), 0xFF00)
+// into:
+//   x2 = (shl x1, 8)   ; reuse the computation of x1
+SDValue DAGCombiner::combine_AND_ShiftAND(SDNode *N, SDValue &N0, SDValue &N1) {
+  ConstantSDNode *mask = dyn_cast<ConstantSDNode>(N1);
+  if (!mask)
+    return SDValue();
+
+  if ((N0.getNumOperands() != 2) || (!N0.hasOneUse()))
+    return SDValue();
+
+  ConstantSDNode *shiftAmount = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!shiftAmount)
+    return SDValue();
+
+  const ISD::NodeType N0Opcode = (ISD::NodeType)N0.getOpcode();
+  if (((N0Opcode < ISD::SHL) || (N0Opcode > ISD::ROTR)) &&
+      ((N0Opcode < ISD::SHL_PARTS) || (N0Opcode > ISD::SRL_PARTS)))
+    return SDValue();
+
+  const auto &maskedValue = dyn_cast<SDNode>(N0.getOperand(0));
+  for (SDNode *otherUser : maskedValue->uses()) {
+    SDNode *shiftOperand = dyn_cast<SDNode>(N0);
+    if ((shiftOperand == nullptr) || (otherUser == shiftOperand) ||
+        (otherUser->getOpcode() != ISD::AND))
+      continue;
+
+    ConstantSDNode *otherMask =
+        dyn_cast<ConstantSDNode>(otherUser->getOperand(1));
+    if (!otherMask)
+      continue;
+
+    bool canReduce = false;
+
+    const APInt &maskValue = mask->getAPIntValue();
+    const APInt &shiftValue = shiftAmount->getAPIntValue();
+    const APInt &otherMaskValue = otherMask->getAPIntValue();
+    switch (N0Opcode) {
+    case ISD::SHL:
+      canReduce = (maskValue.lshr(shiftValue) == otherMaskValue);
+      break;
+    case ISD::SRA:
+    case ISD::SRL:
+      canReduce = (maskValue.shl(shiftValue) == otherMaskValue);
+      break;
+    case ISD::ROTL:
+      canReduce = (maskValue.rotr(shiftValue) == otherMaskValue);
+      break;
+    case ISD::ROTR:
+      canReduce = (maskValue.rotl(shiftValue) == otherMaskValue);
+      break;
+    case ISD::SHL_PARTS:
+    case ISD::SRA_PARTS:
+    case ISD::SRL_PARTS:
+      DEBUG(dbgs() << "Todo\n");
+      break;
+    default:
+      llvm_unreachable("This opcode is not accepted!");
+      break;
+    }
+    if (canReduce) {
+      DEBUG(dbgs() << " with: "; N0.getNode()->dump();
+            dbgs() << " and : "; otherUser->dump(););
+
+      SDValue shiftTheAND(otherUser, 0);
+      const SDLoc DL(N0);
+      EVT VT = N->getValueType(0);
+      SDValue newShift =
+          DAG.getNode(N0Opcode, DL, VT, shiftTheAND, N0.getOperand(1));
+      AddToWorklist(maskedValue);
+      AddToWorklist(otherUser);
+      return newShift;
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4205,6 +4288,9 @@
        (N0.getOpcode() == ISD::ANY_EXTEND &&
         N0.getOperand(0).getOpcode() == ISD::LOAD))) {
     if (SDValue Res = ReduceLoadWidth(N)) {
+      if (Res.getOpcode() == ISD::SHL)
+        return Res;
+
       LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND ?
         cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
@@ -4214,6 +4300,9 @@
     }
   }
 
+  if (SDValue r = combine_AND_ShiftAND(N, N0, N1))
+    return r;
+
   if (Level >= AfterLegalizeTypes) {
     // Attempt to propagate the AND back up to the leaves which, if they're
     // loads, can be combined to narrow loads and the AND node can be removed.
@@ -6262,13 +6351,36 @@
   }
 
   // fold (srl (shl x, c), c) -> (and x, cst2)
-  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
-      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
-    SDLoc DL(N);
-    SDValue Mask =
-        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
-    AddToWorklist(Mask.getNode());
-    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
+  if ((N0.getOpcode() == ISD::SHL) &&
+      (isConstantOrConstantVector(N1, /* NoOpaques */ true))) {
+    bool canFold = N0.getOperand(1) == N1;
+    if (!canFold) {
+      const ConstantSDNode *CN0N1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (CN0N1 && N1C)
+        canFold = CN0N1->getZExtValue() == N1C->getZExtValue();
+    }
+
+    if (canFold) {
+      // fold (srl (shl x, c), c) -> x if the upper c bits of x are known to
+      // be 0.
+      // TODO: Add more instructions that produce known-zero upper bits, other
+      // than zero-extending loads.
+      if (N1C) {
+        if (LoadSDNode *x = dyn_cast<LoadSDNode>(N0.getOperand(0))) {
+          const unsigned xSize = x->getValueSizeInBits(0);
+          const unsigned xMemSize = x->getMemOperand()->getSize() * 8;
+          if ((xSize > xMemSize) &&
+              ((xSize - xMemSize) >= N1C->getZExtValue()) &&
+              (x->getExtensionType() == ISD::LoadExtType::ZEXTLOAD))
+            return N0.getOperand(0);
+        }
+      }
+      SDLoc DL(N);
+      SDValue Mask =
+          DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
+      AddToWorklist(Mask.getNode());
+      return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
+    }
   }
 
   // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
@@ -8504,6 +8616,9 @@
   if (VT.isVector())
     return SDValue();
 
+  unsigned ShAmt = 0;
+  unsigned ShLeftAmt = 0;
+
   // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
   // extended to VT.
   if (Opc == ISD::SIGN_EXTEND_INREG) {
@@ -8531,15 +8646,58 @@
   } else if (Opc == ISD::AND) {
     // An AND with a constant mask is the same as a truncate + zero-extend.
     auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
-    if (!AndC || !AndC->getAPIntValue().isMask())
+    // TODO: Not only [shifted] masks should be accepted; e.g. (and ld.16 [M],
+    // 0x00AB) can be replaced by (and ld.8.zext16 [M], 0x00AB).
+    if (!AndC || !(AndC->getAPIntValue().isMask() ||
+                   AndC->getAPIntValue().isShiftedMask()))
+      return SDValue();
+    const APInt &maskAPInt = AndC->getAPIntValue();
+    unsigned maxBit = maskAPInt.getBitWidth() - maskAPInt.countLeadingZeros();
+    const unsigned minBit = maskAPInt.countTrailingZeros();
+    // Only accept masks whose bounds are multiples of 8 bits and whose width
+    if ((maxBit | minBit) % 8)
       return SDValue();
 
-    unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+    // is a power of 2.
+    unsigned ActiveBits = maxBit - minBit;
+    if (ActiveBits & (ActiveBits - 1))
+      return SDValue();
+
+    DEBUG(dbgs() << "\n\tMask: "; AndC->dump();
+          dbgs() << "\n\t\tmaxActiveBit: " << maxBit - 1
+                 << "\n\t\tminActiveBit: " << minBit << '\n');
+
+    LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0);
     ExtType = ISD::ZEXTLOAD;
+    if (minBit != 0) {
+      // TODO: How should this be handled if the operand is not a load?
+      if (LN0 == nullptr)
+        return SDValue();
+
+      const auto &mvt = LN0->getMemoryVT();
+      if (minBit >= mvt.getSizeInBits()) {
+        // The (and) is only filtering bits that came from the extension, not
+        // the actual loaded data.
+        if (ISD::LoadExtType::ZEXTLOAD == LN0->getExtensionType()) {
+          // We only read the zero-extended bits.
+          return DAG.getConstant(0, SDLoc(N), AndC->getValueType(0));
+        }
+        // We would read the sign-extended bits, which are not known here.
+        return SDValue();
+      }
+      if (maxBit > mvt.getSizeInBits())
+        ExtType = LN0->getExtensionType();
+    }
+    // TODO: Accept SEXT if the target can cheaply do a load + shl; e.g.
+    // (and (ld.32bit.sext.from16 [M]), 0x00FFFF00) can be replaced by
+    // (and (shl (ld.32bit.sext.from8 [M+1]), 8), 0x00FFFF00).
+    if (ExtType != ISD::ZEXTLOAD)
+      return SDValue();
+
     ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+    ShAmt = minBit;
+    ShLeftAmt = minBit;
   }
 
-  unsigned ShAmt = 0;
   if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     SDValue SRL = N0;
     if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
@@ -8590,7 +8748,6 @@
 
   // If the load is shifted left (and the result isn't shifted back right),
   // we can fold the truncate through the shift.
-  unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
       ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
@@ -12831,6 +12988,85 @@
                       St->getPointerInfo().getWithOffset(StOffset), NewAlign)
       .getNode();
 }
+/// Detects operations such as M[i] = M[i] | (M[i] << K) or
+/// M[i] = M[i] | (M[i] >> K),
+/// where K is half the store width. If we can prove that the operation only
+/// copies bytes from one position to another, it might be possible to reduce
+/// the width of the store and remove the 'or + shift' operations.
+SDNode *
+DAGCombiner::ShrinkLoadShiftOrStoreWithLoadNewStore(StoreSDNode *storeOp,
+                                                    const SDNode *orOp) {
+
+  SDValue loadSD = orOp->getOperand(0);
+  LoadSDNode *load = dyn_cast<LoadSDNode>(loadSD);
+  SDNode *other = orOp->getOperand(1).getNode();
+  if (!load) {
+    loadSD = orOp->getOperand(1);
+    load = dyn_cast<LoadSDNode>(loadSD);
+    other = orOp->getOperand(0).getNode();
+  }
+
+  if (!load)
+    return nullptr;
+
+  unsigned shiftedBytes = 0;
+  if ((other->getOpcode() == ISD::SHL) &&
+      (other->getOperand(0).getNode() == load) &&
+      isa<ConstantSDNode>(other->getOperand(1)))
+    shiftedBytes = cast<ConstantSDNode>(other->getOperand(1).getNode())
+                       ->getAPIntValue()
+                       .getSExtValue() /
+                   8;
+  // TODO: Accept other shift operations such as srl and sra; a negative
+  // value of shiftedBytes could represent those.
+
+  unsigned storeMemSz = storeOp->getMemoryVT().getStoreSize();
+  // For now we only accept chains that move half of the loaded value to the
+  // other half.
+  if (2 * shiftedBytes != storeMemSz)
+    return nullptr;
+
+  const SDValue loadPtr = load->getBasePtr();
+  SDValue Ptr = storeOp->getBasePtr();
+  // TODO: Detect when the LOAD and STORE addresses are both ADD nodes with a
+  // known difference.
+  bool samePtr = loadPtr == Ptr;
+  if (!samePtr && (loadPtr.getOpcode() != ISD::ADD))
+    return nullptr;
+
+  // Detect if we are moving M[A+k] to M[A]:
+  if (!((loadPtr.getOperand(0) == Ptr) || (loadPtr.getOperand(1) == Ptr)))
+    return nullptr;
+
+  ConstantSDNode *offset = dyn_cast<ConstantSDNode>(loadPtr.getOperand(1));
+  if (!offset)
+    offset = dyn_cast<ConstantSDNode>(loadPtr.getOperand(0));
+
+  if (!offset)
+    return nullptr;
+
+  int64_t loadByteOffset = offset->getAPIntValue().getSExtValue();
+  // TODO: Accept negative offsets. How often does that happen?
+  if (loadByteOffset < 0)
+    return nullptr;
+
+  unsigned loadMemSz = load->getMemoryVT().getStoreSize();
+  bool upperHalfLoad =
+      ((loadByteOffset == loadMemSz) && (2 * loadMemSz == storeMemSz));
+
+  if (!(upperHalfLoad || samePtr))
+    return nullptr;
+
+  if (samePtr) {
+    // TODO: Store the lower loaded value to the upper half
+    if ((loadMemSz == storeMemSz) && (load->use_size() == 2)) {
+      DEBUG(dbgs() << "Reduce load width\n");
+    }
+    DEBUG(dbgs() << "Move lower to upper half\n");
+  }
+  DEBUG(dbgs() << "Reduce store width to half width\n");
+  return ShrinkLoadReplaceStoreWithStore({loadMemSz, 0}, loadSD, storeOp, this);
+}
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
@@ -12843,14 +13079,23 @@
   SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
-  SDValue Ptr   = ST->getBasePtr();
+  SDValue Ptr = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
-  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+  if (VT.isVector() || !Value.hasOneUse())
    return SDValue();
 
   unsigned Opc = Value.getOpcode();
+  if (Opc == ISD::OR) {
+    if (SDNode *NewSt =
+            ShrinkLoadShiftOrStoreWithLoadNewStore(ST, Value.getNode()))
+      return SDValue(NewSt, 0);
+  }
+
+  if (ST->isTruncatingStore())
+    return SDValue();
+
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes. If so, we try to replace the
Index: test/CodeGen/ARM/stld-width-reduction1.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction1.ll
@@ -0,0 +1,33 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x2:
+; CHECK: %bb.0:
+; CHECK: ldrh
+; CHECK-NEXT: strb
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x2(i16* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i16, i16* %0, i32 %1
+  %4 = load i16, i16* %3, align 2, !tbaa !3
+  %5 = and i16 %4, 255
+  %6 = shl i16 %4, 8
+  %7 = or i16 %5, %6
+  store i16 %7, i16* %3, align 2, !tbaa !3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/ARM/stld-width-reduction2.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction2.ll
@@ -0,0 +1,34 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x2p1:
+; CHECK: %bb.0:
+; CHECK: ldrb
+; CHECK-NEXT: strb
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x2p1(i16* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i16, i16* %0, i32 %1
+  %4 = load i16, i16* %3, align 2, !tbaa !3
+  %5 = and i16 %4, -256
+  %6 = lshr i16 %4, 8
+  %7 = or i16 %5, %6
+  store i16 %7, i16* %3, align 2, !tbaa !3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/ARM/stld-width-reduction3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction3.ll
@@ -0,0 +1,35 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x4p1:
+; CHECK: ldrb
+; CHECK-NEXT: orr
+; CHECK-NEXT: str
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x4p1(i32* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i32, i32* %0, i32 %1
+  %4 = load i32, i32* %3, align 4, !tbaa !7
+  %5 = and i32 %4, 65280
+  %6 = lshr i32 %4, 8
+  %7 = and i32 %6, 255
+  %8 = or i32 %7, %5
+  store i32 %8, i32* %3, align 4, !tbaa !7
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/ARM/stld-width-reduction4.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stld-width-reduction4.ll
@@ -0,0 +1,36 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv4t-arm-none-eabi"
+
+; RUN: llc -mtriple=arm %s -o - | FileCheck %s
+; CHECK-LABEL: test_1x4p1_shl:
+; CHECK: ldrb
+; CHECK-NEXT: lsl
+; CHECK-NEXT: orr
+; CHECK-NEXT: str
+
+; Function Attrs: norecurse nounwind
+define dso_local void @test_1x4p1_shl(i32* nocapture, i32) local_unnamed_addr #0 {
+  %3 = getelementptr inbounds i32, i32* %0, i32 %1
+  %4 = load i32, i32* %3, align 4, !tbaa !7
+  %5 = and i32 %4, 65280
+  %6 = shl i32 %4, 8
+  %7 = and i32 %6, 16711680
+  %8 = or i32 %7, %5
+  store i32 %8, i32* %3, align 4, !tbaa !7
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+armv4t,+strict-align,-thumb-mode" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 7.0.0 (trunk 331513)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: test/CodeGen/X86/fp128-i128.ll
===================================================================
--- test/CodeGen/X86/fp128-i128.ll
+++ test/CodeGen/X86/fp128-i128.ll
@@ -45,13 +45,13 @@
 ; CHECK-LABEL: TestUnionLD1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
-; CHECK-NEXT:    andq %rdi, %rcx
-; CHECK-NEXT:    movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
-; CHECK-NEXT:    andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    shlq $48, %rax
+; CHECK-NEXT:    movq -24(%rsp), %rcx
+; CHECK-NEXT:    movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF
+; CHECK-NEXT:    andq %rdi, %rdx
+; CHECK-NEXT:    orq %rax, %rdx
+; CHECK-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    jmp foo # TAILCALL
Index: test/CodeGen/X86/pr32329.ll
===================================================================
--- test/CodeGen/X86/pr32329.ll
+++ test/CodeGen/X86/pr32329.ll
@@ -29,67 +29,67 @@
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl obj, %edx
-; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %esi
-; X86-NEXT:    movl var_310, %ecx
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    andl $4194303, %edx # imm = 0x3FFFFF
-; X86-NEXT:    leal (%edx,%edx), %ebx
-; X86-NEXT:    subl %eax, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X86-NEXT:    movl $9, %esi
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %esi, %ebp
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %esi, %ebp
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %esi, var_50
-; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    movb %bl, var_218
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movsbl var_27, %eax
+; X86-NEXT:    movzwl var_2, %esi
+; X86-NEXT:    movl var_310, %ecx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    addl var_24, %ecx
+; X86-NEXT:    movl $4194303, %edi # imm = 0x3FFFFF
+; X86-NEXT:    andl obj, %edi
+; X86-NEXT:    leal (%edi,%edi), %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X86-NEXT:    movl $9, %esi
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    shldl %cl, %esi, %ebp
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    cmovnel %ecx, %esi
+; X86-NEXT:    cmpl %edi, %ebx
+; X86-NEXT:    movl %ebp, var_50+4
+; X86-NEXT:    movl %esi, var_50
+; X86-NEXT:    setge var_205
+; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    movb %dl, var_218
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl {{.*}}(%rip), %eax
-; X64-NEXT:    movsbl {{.*}}(%rip), %r9d
-; X64-NEXT:    movzwl {{.*}}(%rip), %r8d
-; X64-NEXT:    movl {{.*}}(%rip), %ecx
-; X64-NEXT:    imull %r9d, %ecx
-; X64-NEXT:    addl {{.*}}(%rip), %ecx
-; X64-NEXT:    andl $4194303, %eax # imm = 0x3FFFFF
-; X64-NEXT:    leal (%rax,%rax), %edi
-; X64-NEXT:    subl %r9d, %edi
-; X64-NEXT:    movl %edi, %esi
-; X64-NEXT:    subl %r8d, %esi
-; X64-NEXT:    imull %esi, %ecx
-; X64-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X64-NEXT:    movl $9, %edx
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq %rdx, {{.*}}(%rip)
-; X64-NEXT:    cmpl %eax, %esi
-; X64-NEXT:    setge {{.*}}(%rip)
-; X64-NEXT:    imull %r9d, %edi
-; X64-NEXT:    movb %dil, {{.*}}(%rip)
-; X64-NEXT:    retq
+; X64-NEXT:    movsbl var_27(%rip), %r9d
+; X64-NEXT:    movzwl var_2(%rip), %r8d
+; X64-NEXT:    movl var_310(%rip), %ecx
+; X64-NEXT:    imull %r9d, %ecx
+; X64-NEXT:    addl var_24(%rip), %ecx
+; X64-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
+; X64-NEXT:    andl obj(%rip), %esi
+; X64-NEXT:    leal (%rsi,%rsi), %edi
+; X64-NEXT:    subl %r9d, %edi
+; X64-NEXT:    movl %edi, %edx
+; X64-NEXT:    subl %r8d, %edx
+; X64-NEXT:    imull %edx, %ecx
+; X64-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X64-NEXT:    movl $9, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rax, var_50(%rip)
+; X64-NEXT:    cmpl %esi, %edx
+; X64-NEXT:    setge var_205(%rip)
+; X64-NEXT:    imull %r9d, %edi
+; X64-NEXT:    movb %dil, var_218(%rip)
+; X64-NEXT:    retq
 entry:
   %bf.load = load i32, i32* bitcast (%struct.AA* @obj to i32*), align 8
   %bf.clear = shl i32 %bf.load, 1
Index: test/CodeGen/X86/pr32588.ll
===================================================================
--- test/CodeGen/X86/pr32588.ll
+++ test/CodeGen/X86/pr32588.ll
@@ -4,10 +4,10 @@
 @b = external local_unnamed_addr global i32, align 4
 @d = external local_unnamed_addr global i32, align 4
 
-; CHECK: cmpl $1, c(%rip)
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movl %eax, d(%rip)
+; CHECK: xorl %eax, %eax
+; CHECK-NEXT: cmpl $0, c(%rip)
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: movl %eax, d(%rip)
 ; CHECK-NEXT: retq
 
 define void @fn1() {