diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1412,11 +1412,20 @@ Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts, - Known2, TLO, Depth + 1)) + APInt DemandedBits0 = ~Known.One & DemandedBits; + if (SimplifyDemandedBits(Op0, DemandedBits0, DemandedElts, Known2, TLO, + Depth + 1)) return true; assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); + // Try Op1 again now we have the KnownBits of Op0. + APInt DemandedBits1 = ~(Known2.One & DemandedBits0) & DemandedBits; + if (DemandedBits1 != DemandedBits) + if (SimplifyDemandedBits(Op1, DemandedBits1, DemandedElts, Known, TLO, + Depth + 1)) + return true; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + // Attempt to avoid multi-use ops if we don't need anything from them. if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) { SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -191,23 +191,20 @@ ret void } -; FIXME here should have been "v_perm_b32" with 0xffff0500 mask. define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; GCN-LABEL: and_or_or_and: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, 0xffff0500 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_and_b32 s0, s0, 0xff00 -; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 -; GCN-NEXT: v_or_b32_e32 v2, s0, v2 +; GCN-NEXT: v_perm_b32 v2, s0, v2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -277,30 +274,27 @@ ret void } -; FIXME here should have been "v_perm_b32" with 0xffff0500 mask. define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; GCN-LABEL: known_ffff0500: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004 +; GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500 +; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v4, v[0:1] ; GCN-NEXT: s_bitset1_b32 s0, 15 -; GCN-NEXT: s_and_b32 s0, s0, 0xff00 -; GCN-NEXT: s_or_b32 s0, s0, 0xffff0000 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v4, 4, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 -; GCN-NEXT: v_or_b32_e32 v4, s0, v4 +; GCN-NEXT: v_perm_b32 v4, s0, v4, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 -; GCN-NEXT: flat_store_dword v[2:3], v5 +; GCN-NEXT: flat_store_dword v[2:3], v6 ; GCN-NEXT: s_endpgm bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x()