Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -403,6 +403,7 @@
   // These should be promoted to a larger select which is supported.
   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
+  // X86 wants to expand cmov itself.
   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
@@ -3477,8 +3478,29 @@
   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
 }
 
-static bool MayFoldIntoStore(SDValue Op) {
-  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+// MayFoldIntoReadModifyWrite - Return true if this op may be folded into a
+// read/modify/write style instruction. This can only happen when the load
+// and the store use the same address expression. The load to be checked
+// for address equality with the store is passed in.
+//
+static bool MayFoldIntoReadModifyWrite(SDValue Op, SDValue Load) {
+  if (!Op.hasOneUse())
+    return false;
+
+  if (!MayFoldLoad(Load))
+    return false;
+
+  SDNode *User = *(Op.getNode()->use_begin());
+
+  if (!ISD::isNormalStore(User))
+    return false;
+
+  // Check that the load and store use the same address.
+  LoadSDNode *LD = cast<LoadSDNode>(Load);
+  StoreSDNode *ST = cast<StoreSDNode>(User);
+
+  return LD->getBasePtr() == ST->getBasePtr();
 }
 
 static bool isTargetShuffle(unsigned Opcode) {
@@ -23999,7 +24021,7 @@
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+    if (MayFoldIntoReadModifyWrite(Op, N0))
       return false;
     Promote = true;
     break;
   }
@@ -24014,14 +24036,55 @@
   case ISD::SUB: {
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
-    if (!Commute && MayFoldLoad(N1))
-      return false;
+
     // Avoid disabling potential load folding opportunities.
-    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
-      return false;
-    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
-      return false;
+    if (Commute && MayFoldLoad(N0)) {
+      // Try not to disable read/modify/write combining for this operation.
+      if (MayFoldIntoReadModifyWrite(Op, N0))
+        return false;
+
+      // The load in N0 is only foldable if N1 is not a constant and has a
+      // single use. Otherwise the operation will most likely need the N0
+      // load's register as a modifiable destination.
+      if (!isa<ConstantSDNode>(N1)) {
+        // Truncates are effectively not instructions, so check the
+        // truncate's operand, but only when the TRUNCATE does not already
+        // have multiple uses. The point is that a single-use NOP-like
+        // instruction should not make its value look usable as a modifiable
+        // register once this reaches register allocation.
+        if (N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse())
+          N1 = N1.getOperand(0);
+
+        if (N1.hasOneUse())
+          return false;
+      }
+    }
+
+    if (MayFoldLoad(N1)) {
+      // Try not to disable read/modify/write combining for this operation.
+      if (MayFoldIntoReadModifyWrite(Op, N1))
+        return false;
+
+      // The load in N1 is only foldable if N0 is not a constant and has a
+      // single use. Otherwise the operation will most likely need the N1
+      // load's register as a modifiable destination.
+      if (!isa<ConstantSDNode>(N0)) {
+        // Truncates are effectively not instructions, so check the
+        // truncate's operand, but only when the TRUNCATE does not already
+        // have multiple uses. The point is that a single-use NOP-like
+        // instruction should not make its value look usable as a modifiable
+        // register once this reaches register allocation.
+        if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+          N0 = N0.getOperand(0);
+
+        if (N0.hasOneUse())
+          return false;
+      }
+    }
     Promote = true;
+    break;
   }
   }
Index: test/CodeGen/X86/atom-lea-addw-bug.ll
===================================================================
--- test/CodeGen/X86/atom-lea-addw-bug.ll
+++ test/CodeGen/X86/atom-lea-addw-bug.ll
@@ -1,19 +1,31 @@
+; This test is supposed to check some code in X86FixupLEAs.cpp that only gets
+; triggered when a 16-bit register-register add is created and the two
+; registers are not the same actual register. That code change was committed
+; in LLVM revision 191711 of that file.
+;
+; However, with the changes that improve 16-bit promotion to cut down on
+; partial register operations, I could find no way to get that instruction
+; generated any longer. So this test has become mostly useless, but rather
+; than removing it, I changed it to check for leal, which is consistently
+; generated. That also gives a future developer the opportunity to see the
+; comment on this test.
+;
 ; RUN: llc < %s -mcpu=atom | FileCheck %s
 ; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-darwin12.5.0"
 
-define i32 @DoLayout() {
+define i32 @DoLayout(i16* %p1, i16* %p2) {
 entry:
-  %tmp1 = load i16, i16* undef, align 2
-  %tmp17 = load i16, i16* null, align 2
-  %tmp19 = load i16, i16* undef, align 2
-  %shl = shl i16 %tmp19, 1
+  %tmp1 = load i16, i16* %p1, align 2
+  %tmp17 = load i16, i16* %p2, align 2
+  %shl = shl i16 %tmp1, 1
   %add55 = add i16 %tmp17, %tmp1
   %add57 = add i16 %add55, %shl
   %conv60 = zext i16 %add57 to i32
   %add61 = add nsw i32 %conv60, 0
   %conv63 = and i32 %add61, 65535
   ret i32 %conv63
-; CHECK: addw
+; CHECK: leal
 }
Index: test/CodeGen/X86/simp-loadfold16.ll
===================================================================
--- test/CodeGen/X86/simp-loadfold16.ll
+++ test/CodeGen/X86/simp-loadfold16.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s | FileCheck %s
+;
+; This test checks to make sure that operations with load operands that
+; should be foldable into the arithmetic operations do not get promoted
+; into longer forms.
+
+; ModuleID = 'simp-loadfold16.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-linux"
+
+define void @myf2(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into add
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = add i16 %t1, %t3              ; Should stay addw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf2:
+; CHECK: addw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf3(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into add
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = add i16 %t3, %t1              ; Should stay addw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf3:
+; CHECK: addw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf4(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into sub
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = sub i16 %t3, %t1              ; Should stay subw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf4:
+; CHECK: subw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf5(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into or
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = or i16 %t1, %t3               ; Should stay orw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf5:
+; CHECK: orw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf6(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into or
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = or i16 %t3, %t1               ; Should stay orw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf6:
+; CHECK: orw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf7(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into and
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = and i16 %t1, %t3              ; Should stay andw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf7:
+; CHECK: andw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf8(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into and
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = and i16 %t3, %t1              ; Should stay andw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf8:
+; CHECK: andw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf9(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into xor
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = xor i16 %t1, %t3              ; Should stay xorw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf9:
+; CHECK: xorw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
+
+define void @myf10(i16 *%p1, i8 *%p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; Should be folded into xor
+  %t2 = load i8, i8* %p2, align 1
+  %t3 = zext i8 %t2 to i16
+  %t4 = xor i16 %t3, %t1              ; Should stay xorw to allow folding.
+  store i16 %t4, i16* %p3, align 2    ; Should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf10:
+; CHECK: xorw
+; CHECK-SAME: (%rdi)
+; CHECK: movw
+}
Index: test/CodeGen/X86/simp-no-rmw1.ll
===================================================================
--- test/CodeGen/X86/simp-no-rmw1.ll
+++ test/CodeGen/X86/simp-no-rmw1.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s | FileCheck %s
+;
+; This test checks to make sure that patterns that merely look like
+; read-modify-write operations, but where the load and store addresses
+; differ, get properly promoted to 32 bit operations so that
+; many fewer partial register operations are used.
+;
+
+; ModuleID = 'simp-no-rmw1.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-linux"
+
+define void @myf1(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = shl i16 %t1, 5
+  store i16 %t2, i16* %p2, align 2
+  ret void
+; CHECK-LABEL: myf1:
+; CHECK: movzwl
+; CHECK: shll
+; CHECK: movw
+}
+
+define void @myf2(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; This should become a movzwl.
+  %t2 = add i16 %t1, 357              ; This should be a 32 bit add or lea.
+  store i16 %t2, i16* %p2, align 2    ; This should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf2:
+; CHECK: movzwl
+; CHECK: addl
+; CHECK: movw
+}
+
+define void @myf3(i16 *%p1, i16 %p2, i16 *%p3) {
+entry:
+  %t1 = load i16, i16* %p1, align 2   ; This should become a movzwl.
+  %t2 = sub i16 %t1, %p2              ; This should be a 32 bit sub.
+  store i16 %t2, i16* %p3, align 2    ; This should be a 16 bit store.
+  ret void
+; CHECK-LABEL: myf3:
+; CHECK: movzwl
+; CHECK: subl
+; CHECK: movw
+}
+
+define void @myf4(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = or i16 %t1, 263
+  store i16 %t2, i16* %p2, align 2
+  ret void
+; CHECK-LABEL: myf4:
+; CHECK: movzwl
+; CHECK: orl
+; CHECK: movw
+}
+
+define void @myf5(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = xor i16 %t1, 263
+  store i16 %t2, i16* %p2, align 2
+  ret void
+; CHECK-LABEL: myf5:
+; CHECK: movzwl
+; CHECK: xorl
+; CHECK: movw
+}
+
+define void @myf6(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = and i16 %t1, 258
+  store i16 %t2, i16* %p2, align 2
+  ret void
+; CHECK-LABEL: myf6:
+; CHECK: movzwl
+; CHECK: andl
+; CHECK: movw
+}
Index: test/CodeGen/X86/simp-promote1.ll
===================================================================
--- test/CodeGen/X86/simp-promote1.ll
+++ test/CodeGen/X86/simp-promote1.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s | FileCheck %s
+;
+; This test checks to make sure that certain operations get promoted from
+; 16 bit operations into 32 bit operations. This is desirable in order to
+; decrease instances of partial register usage, which hurts performance.
+;
+
+; ModuleID = 'simp-promote1.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-linux"
+
+define i32 @myf1(i32* %p1, i16* %p2, i16* %p3) {
+entry:
+  %t1 = load i32, i32* %p1, align 4
+  %t2 = trunc i32 %t1 to i16
+  %t3 = load i16, i16* %p2, align 2
+  %t4 = or i16 %t3, %t2
+  store i16 %t4, i16* %p3, align 2
+  ret i32 %t1
+; CHECK-LABEL: myf1:
+; CHECK: movzwl
+; CHECK: orl
+; CHECK: movw
+}
+
+define i32 @myf2(i32* %p1, i16* %p2, i16* %p3) {
+entry:
+  %t1 = load i32, i32* %p1, align 4
+  %t2 = trunc i32 %t1 to i16
+  %t3 = load i16, i16* %p2, align 2
+  %t4 = xor i16 %t2, %t3
+  store i16 %t4, i16* %p3, align 2
+  ret i32 %t1
+; CHECK-LABEL: myf2:
+; CHECK: movzwl
+; CHECK: xorl
+; CHECK: movw
+}
+
+define i32 @myf3(i32* %p1, i16* %p2, i16* %p3) {
+entry:
+  %t1 = load i32, i32* %p1, align 4
+  %t2 = trunc i32 %t1 to i16
+  %t3 = load i16, i16* %p2, align 2
+  %t4 = shl i16 %t2, %t3
+  store i16 %t4, i16* %p3, align 2
+  ret i32 %t1
+; CHECK-LABEL: myf3:
+; CHECK: movzwl
+; CHECK: shll
+; CHECK: movw
+}
+
+define i32 @myf4(i32* %p1, i16* %p2, i16* %p3) {
+entry:
+  %t1 = load i32, i32* %p1, align 4
+  %t2 = trunc i32 %t1 to i16
+  %t3 = load i16, i16* %p2, align 2
+  %t4 = lshr i16 %t2, %t3
+  store i16 %t4, i16* %p3, align 2
+  ret i32 %t1
+; CHECK-LABEL: myf4:
+; CHECK: movzwl
+; CHECK: shrl
+; CHECK: movw
+}
Index: test/CodeGen/X86/simp-rmw-vol.ll
===================================================================
--- test/CodeGen/X86/simp-rmw-vol.ll
+++ test/CodeGen/X86/simp-rmw-vol.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s | FileCheck %s
+;
+; This test checks to make sure that Read-Modify-Write instructions still
+; get generated for some 16 bit operations on volatile memory.
+; This makes sure that the new code in IsDesirableToPromoteOp doesn't break
+; read-modify-write instruction creation for 16 bit operations.
+
+; ModuleID = 'simp-rmw-vol.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin12.5.0"
+
+define void @myf1(i16 *%p1) {
+entry:
+  %t1 = load volatile i16, i16* %p1, align 2
+  %t2 = shl i16 %t1, 5
+  store volatile i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf1:
+; CHECK: shlw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf2(i16 *%p1) {
+entry:
+  %t1 = load volatile i16, i16* %p1, align 2
+  %t2 = or i16 %t1, 263
+  store volatile i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf2:
+; CHECK: orw
+; CHECK-SAME: (%rdi)
+}
Index: test/CodeGen/X86/simp-rmw1.ll
===================================================================
--- test/CodeGen/X86/simp-rmw1.ll
+++ test/CodeGen/X86/simp-rmw1.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s | FileCheck %s
+;
+; This test checks to make sure that Read-Modify-Write instructions still
+; get generated for 16 bit operations. This makes sure that the new
+; code in IsDesirableToPromoteOp doesn't break read-modify-write instruction
+; creation for 16 bit operations.
+
+; ModuleID = 'simp-rmw1.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin12.5.0"
+
+define void @myf1(i16 *%p1) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = shl i16 %t1, 5
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf1:
+; CHECK: shlw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf2(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t11 = load i16, i16* %p2, align 2  ; This should get promoted to a movzwl.
+  %t2 = add i16 %t1, %t11             ; This should be a RMW addw.
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf2:
+; FIXME: movzwl - This improvement not implemented yet.
+; CHECK: addw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf3(i16 *%p1, i16 *%p2) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t11 = load i16, i16* %p2, align 2  ; This should get promoted to a movzwl.
+  %t2 = sub i16 %t1, %t11             ; This should be a RMW subw.
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf3:
+; FIXME: movzwl - This improvement not implemented yet.
+; CHECK: subw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf4(i16 *%p1) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = or i16 %t1, 263
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf4:
+; CHECK: orw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf5(i16 *%p1) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = xor i16 %t1, 263
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf5:
+; CHECK: xorw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf6(i16 *%p1) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = and i16 %t1, 258
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf6:
+; CHECK: andw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf7(i16 *%p1) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = lshr i16 %t1, 5
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf7:
+; CHECK: shrw
+; CHECK-SAME: (%rdi)
+}
+
+define void @myf8(i16 *%p1) {
+entry:
+  %t1 = load i16, i16* %p1, align 2
+  %t2 = ashr i16 %t1, 5
+  store i16 %t2, i16* %p1, align 2
+  ret void
+; CHECK-LABEL: myf8:
+; CHECK: sarw
+; CHECK-SAME: (%rdi)
+}