Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3842,9 +3842,16 @@
       EVT ExtVT;
       if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
           isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
-        // Only add this load if we can make it more narrow.
-        if (ExtVT.bitsLT(Load->getMemoryVT()))
+
+        // ZEXTLOAD is already small enough.
+        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
+            ExtVT.bitsGE(Load->getMemoryVT()))
+          continue;
+
+        // Use LE to convert equal sized loads to zext.
+        if (ExtVT.bitsLE(Load->getMemoryVT()))
           Loads.insert(Load);
+
         continue;
       }
       return false;
@@ -3899,11 +3906,13 @@
     if (Loads.size() == 0)
       return false;
 
+    DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
     SDValue MaskOp = N->getOperand(1);
 
     // If it exists, fixup the single node we allow in the tree that needs
     // masking.
     if (FixupNode) {
+      DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
       SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                                 FixupNode->getValueType(0),
                                 SDValue(FixupNode, 0), MaskOp);
@@ -3922,6 +3931,7 @@
 
     // Create narrow loads.
     for (auto *Load : Loads) {
+      DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
       SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                                 SDValue(Load, 0), MaskOp);
       DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
Index: test/CodeGen/ARM/and-load-combine.ll
===================================================================
--- test/CodeGen/ARM/and-load-combine.ll
+++ test/CodeGen/ARM/and-load-combine.ll
@@ -852,8 +852,7 @@
 ; ARM:       @ %bb.0: @ %entry
 ; ARM-NEXT:    ldrb r0, [r0]
 ; ARM-NEXT:    uxtb r2, r2
-; ARM-NEXT:    and r0, r0, r1
-; ARM-NEXT:    uxtb r1, r0
+; ARM-NEXT:    and r1, r0, r1
 ; ARM-NEXT:    mov r0, #0
 ; ARM-NEXT:    cmp r1, r2
 ; ARM-NEXT:    movweq r0, #1
@@ -863,8 +862,7 @@
 ; ARMEB:       @ %bb.0: @ %entry
 ; ARMEB-NEXT:    ldrb r0, [r0]
 ; ARMEB-NEXT:    uxtb r2, r2
-; ARMEB-NEXT:    and r0, r0, r1
-; ARMEB-NEXT:    uxtb r1, r0
+; ARMEB-NEXT:    and r1, r0, r1
 ; ARMEB-NEXT:    mov r0, #0
 ; ARMEB-NEXT:    cmp r1, r2
 ; ARMEB-NEXT:    movweq r0, #1
@@ -872,9 +870,8 @@
 ;
 ; THUMB1-LABEL: test6:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    ldrb r0, [r0]
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    uxtb r3, r0
+; THUMB1-NEXT:    ldrb r3, [r0]
+; THUMB1-NEXT:    ands r3, r1
 ; THUMB1-NEXT:    uxtb r2, r2
 ; THUMB1-NEXT:    movs r0, #1
 ; THUMB1-NEXT:    movs r1, #0
@@ -889,8 +886,7 @@
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    ldrb r0, [r0]
 ; THUMB2-NEXT:    uxtb r2, r2
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    uxtb r1, r0
+; THUMB2-NEXT:    ands r1, r0
 ; THUMB2-NEXT:    movs r0, #0
 ; THUMB2-NEXT:    cmp r1, r2
 ; THUMB2-NEXT:    it eq
Index: test/CodeGen/X86/pr35763.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr35763.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+
+%struct.S = type <{ i16, i24, [5 x i8], i8, i16, [2 x i8] }>
+
+@z = global { i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] } { i16 -724, i8 94, i8 -18, i8 5, i8 undef, i8 96, i8 104, i8 -24, i8 10, i8 0, [5 x i8] undef }, align 8
+@tf_3_var_136 = global i64 0, align 8
+@.str = private unnamed_addr constant [6 x i8] c"%llu\0A\00", align 1
+
+define void @PR35763() {
+; CHECK-LABEL: PR35763:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzwl {{.*}}(%rip), %eax
+; CHECK-NEXT:    movzwl z+{{.*}}(%rip), %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    movq %rcx, {{.*}}(%rip)
+; CHECK-NEXT:    movl z+{{.*}}(%rip), %eax
+; CHECK-NEXT:    movzbl z+{{.*}}(%rip), %ecx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rax, %rcx
+; CHECK-NEXT:    movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
+; CHECK-NEXT:    andq %rcx, %rax
+; CHECK-NEXT:    movl %eax, z+{{.*}}(%rip)
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movb %al, z+{{.*}}(%rip)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i16, i16* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 0), align 8
+  %conv = sext i16 %0 to i32
+  %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 1) to i32*), align 2
+  %bf.clear = and i32 %bf.load, 2097151
+  %bf.cast = zext i32 %bf.clear to i64
+  %conv1 = trunc i64 %bf.cast to i32
+  %or = or i32 %conv, %conv1
+  %conv2 = trunc i32 %or to i16
+  %conv3 = zext i16 %conv2 to i64
+  store i64 %conv3, i64* @tf_3_var_136, align 8
+  %bf.load4 = load i40, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+  %bf.clear5 = and i40 %bf.load4, -8589869057
+  store i40 %bf.clear5, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+  ret void
+}