Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4304,6 +4304,9 @@
                                 (N0.getOpcode() == ISD::ANY_EXTEND &&
                                  N0.getOperand(0).getOpcode() == ISD::LOAD))) {
     if (SDValue Res = ReduceLoadWidth(N)) {
+      if (Res.getOpcode() == ISD::SHL)
+        return Res;
+
       LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
                             ? cast<LoadSDNode>(N0.getOperand(0))
                             : cast<LoadSDNode>(N0);
@@ -8612,6 +8615,9 @@
   if (VT.isVector())
     return SDValue();
 
+  unsigned ShAmt = 0;
+  unsigned ShLeftAmt = 0;
+
   // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
   // extended to VT.
   if (Opc == ISD::SIGN_EXTEND_INREG) {
@@ -8639,15 +8645,68 @@
   } else if (Opc == ISD::AND) {
     // An AND with a constant mask is the same as a truncate + zero-extend.
     auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
-    if (!AndC || !AndC->getAPIntValue().isMask())
+    if (!AndC)
+      return SDValue();
+
+    // TODO: Constants other than [shifted] masks could also be accepted.
+    // For example, (AND (LD i16, [M]), 0x00AB)
+    // can be replaced by (AND (ZEXT (LD i8, [M]) to i16), 0x00AB).
+    const APInt &MaskAPInt = AndC->getAPIntValue();
+    if (!MaskAPInt.isShiftedMask())
+      return SDValue();
+
+    unsigned MaxBit = MaskAPInt.getBitWidth() - MaskAPInt.countLeadingZeros();
+    const unsigned MinBit = MaskAPInt.countTrailingZeros();
+    // Only accept mask boundaries that are multiples of 8 bits and power-of-2 sizes.
+    if (((MaxBit | MinBit) % 8) != 0)
+      return SDValue();
+
+    unsigned ActiveBits = MaxBit - MinBit;
+    if (ActiveBits & (ActiveBits - 1))
       return SDValue();
 
-    unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+    // LLVM_DEBUG(dbgs() << "\tMask: 0x" << MaskAPInt.toString(16, false)
+    //                   << " : ";
+    //            AndC->dump();
+    //            dbgs() << "\t\tmaxActiveBit: " << MaxBit - 1
+    //                   << "\n\t\tminActiveBit: " << MinBit << '\n');
+
+    LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0);
     ExtType = ISD::ZEXTLOAD;
+    if (MinBit != 0) {
+      // Bail out if the narrowed operand is not a load.
+      if (LN0 == nullptr)
+        return SDValue();
+
+      EVT MemVT = LN0->getMemoryVT();
+      if (MinBit >= MemVT.getSizeInBits()) {
+        // The (and) only selects bits of the extension, not of the actual
+        // loaded value...
+        if (ISD::LoadExtType::ZEXTLOAD == LN0->getExtensionType()) {
+          // Only the zero bits of the extension are read.
+          return DAG.getConstant(0, SDLoc(N), AndC->getValueType(0));
+        }
+        // The mask reads the sign extension, which is not known here.
+        return SDValue();
+      }
+      if (MaxBit > MemVT.getSizeInBits())
+        ExtType = LN0->getExtensionType();
+      // LLVM_DEBUG(dbgs() << "\tCan replace load: "; LN0->dump();
+      //            dbgs() << "\tBy a load of width " << ActiveBits / 8
+      //                   << " bytes, and with offset of " << MinBit / 8
+      //                   << " bytes\n");
+    }
+    // TODO: Accept SEXT if the target supports a load + shl (LD?SH).
+    // An (and (ld.32bit.sext.from16 [M]), 0x00FFFF00) can be replaced by
+    // (and (shl (ld.32bit.sext.from8 [M+1]), 8), 0x00FFFF00).
+    if (ExtType != ISD::ZEXTLOAD)
+      return SDValue();
+
     ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+    ShAmt = MinBit;
+    ShLeftAmt = MinBit;
   }
 
-  unsigned ShAmt = 0;
   if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     SDValue SRL = N0;
     if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
@@ -8698,7 +8757,6 @@
 
   // If the load is shifted left (and the result isn't shifted back right),
   // we can fold the truncate through the shift.
-  unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
       ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
Index: test/CodeGen/ARM/2018-ShiftedAndMask.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/2018-ShiftedAndMask.ll
@@ -0,0 +1,69 @@
+; RUN: llc -O3 -march=arm -o - %s | FileCheck %s -check-prefix=ARM
+; RUN: llc -O3 -march=armeb -o - %s | FileCheck %s -check-prefix=ARMEB
+define void @ldb1(i32* %A) {
+entry:
+  %0 = load i32, i32* %A, align 4
+  %1 = and i32 %0, 65280
+  store i32 %1, i32* %A, align 4
+  ret void
+}
+; ARM-LABEL: ldb1
+; ARM: ldrb [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]], #1{{\]}}
+; ARM: lsl [[R2:r[0-9]+]], [[R0]], #8
+; ARM: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+; ARMEB-LABEL: ldb1
+; ARMEB: ldrb [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]], #2{{\]}}
+; ARMEB: lsl [[R2:r[0-9]+]], [[R0]], #8
+; ARMEB: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+define void @ldb2(i32* %A) {
+entry:
+  %0 = load i32, i32* %A, align 4
+  %1 = and i32 %0, 16711680
+  store i32 %1, i32* %A, align 4
+  ret void
+}
+; ARM-LABEL: ldb2
+; ARM: ldrb [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]], #2{{\]}}
+; ARM: lsl [[R2:r[0-9]+]], [[R0]], #16
+; ARM: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+; ARMEB-LABEL: ldb2
+; ARMEB: ldrb [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]], #1{{\]}}
+; ARMEB: lsl [[R2:r[0-9]+]], [[R0]], #16
+; ARMEB: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+define void @ldb3(i32* %A) {
+entry:
+  %0 = load i32, i32* %A, align 4
+  %1 = and i32 %0, 4278190080
+  store i32 %1, i32* %A, align 4
+  ret void
+}
+; ARM-LABEL: ldb3
+; ARM: ldrb [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]], #3{{\]}}
+; ARM: lsl [[R2:r[0-9]+]], [[R0]], #24
+; ARM: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+; ARMEB-LABEL: ldb3
+; ARMEB: ldrb [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]]{{\]}}
+; ARMEB: lsl [[R2:r[0-9]+]], [[R0]], #24
+; ARMEB: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+define void @ldh(i32* %A) {
+entry:
+  %0 = load i32, i32* %A, align 4
+  %1 = and i32 %0, 4294901760
+  store i32 %1, i32* %A, align 4
+  ret void
+}
+; ARM-LABEL: ldh
+; ARM: ldrh [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]], #2{{\]}}
+; ARM: lsl [[R2:r[0-9]+]], [[R0]], #16
+; ARM: str [[R2]], {{\[}}[[R1]]{{\]}}
+
+; ARMEB-LABEL: ldh
+; ARMEB: ldrh [[R0:r[0-9]+]], {{\[}}[[R1:r[0-9]+]]{{\]}}
+; ARMEB: lsl [[R2:r[0-9]+]], [[R0]], #16
+; ARMEB: str [[R2]], {{\[}}[[R1]]{{\]}}
Index: test/CodeGen/X86/fp128-i128.ll
===================================================================
--- test/CodeGen/X86/fp128-i128.ll
+++ test/CodeGen/X86/fp128-i128.ll
@@ -43,18 +43,18 @@
 ; }
 define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
 ; CHECK-LABEL: TestUnionLD1:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
-; CHECK-NEXT:    andq %rdi, %rcx
-; CHECK-NEXT:    movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
-; CHECK-NEXT:    andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps %xmm0, -24(%rsp)
+; CHECK-NEXT:    movzwl -10(%rsp), %eax
+; CHECK-NEXT:    shlq $48, %rax
+; CHECK-NEXT:    movq -24(%rsp), %rcx
+; CHECK-NEXT:    movabsq $281474976710655, %rdx # imm = 0xFFFFFFFFFFFF
+; CHECK-NEXT:    andq %rdi, %rdx
+; CHECK-NEXT:    orq %rax, %rdx
+; CHECK-NEXT:    movq %rcx, -40(%rsp)
+; CHECK-NEXT:    movq %rdx, -32(%rsp)
+; CHECK-NEXT:    movaps -40(%rsp), %xmm0
+; CHECK-NEXT:    jmp foo # TAILCALL
 entry:
   %0 = bitcast fp128 %s to i128
   %1 = zext i64 %n to i128