diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8971,8 +8971,8 @@ // (and (srl x, (sub c1, c2), MASK) // Only fold this if the inner shift has no other uses -- if it does, // folding this will increase the total number of instructions. - // TODO - drop hasOneUse requirement if c1 == c2? - if (N0.getOpcode() == ISD::SRL && N0.hasOneUse() && + if (N0.getOpcode() == ISD::SRL && + (N0.getOperand(1) == N1 || N0.hasOneUse()) && TLI.shouldFoldConstantShiftPairToMask(N, Level)) { if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, /*AllowUndefs*/ false, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -644,6 +644,10 @@ bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. + bool shouldFoldConstantShiftPairToMask(const SDNode *N, + CombineLevel Level) const override; + /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool shouldConvertConstantLoadToIntImm(const APInt &Imm, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13302,6 +13302,17 @@ return true; } +bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { + assert(((N->getOpcode() == ISD::SHL && + N->getOperand(0).getOpcode() == ISD::SRL) || + (N->getOpcode() == ISD::SRL && + N->getOperand(0).getOpcode() == ISD::SHL)) && + "Expected shift-shift mask"); + // Don't allow multiuse shift folding with the same shift amount. + return N->getOperand(0)->hasOneUse(); +} + bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -131,17 +131,17 @@ ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: s_lshl_b32 s2, s1, 16 -; CI-NEXT: s_or_b32 s0, s0, s2 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s0, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_lshr_b32 s2, s2, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -595,12 +595,12 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_mov_b32_e32 v2, 0 -; GFX803-NEXT: ds_write_b16 v2, v1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: ds_write_b16 v3, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -647,12 +647,12 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: ds_write_b16 v3, v1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -38,8 +38,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -85,8 +85,8 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -106,9 +106,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -123,9 +123,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 +; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -148,9 +147,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -165,9 +164,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 +; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll --- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll +++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll @@ -25,15 +25,15 @@ define hidden fastcc %struct.LIST_HELP* @test(%struct.PROOFSEARCH_HELP* %Search, %struct.LIST_HELP* %ClauseList, i32 %Level, %struct.LIST_HELP** nocapture %New) { ; CHECK-LABEL: test: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: sub.w r9, r2, #32 +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: sub.w r7, r2, #32 ; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: add.w r6, r0, r9, lsr #5 +; CHECK-NEXT: add.w r6, r0, r7, lsr #5 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: lsr.w r7, r9, #5 -; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_1: @ %for.inc ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 @@ -47,7 +47,7 @@ ; CHECK-NEXT: add.w r0, r0, r6, lsl #2 ; CHECK-NEXT: ldr r0, [r0, #40] ; CHECK-NEXT: it hi -; CHECK-NEXT: subhi.w r2, r9, r7, lsl #5 +; CHECK-NEXT: andhi r2, r7, #31 ; CHECK-NEXT: lsrs r0, r2 ; CHECK-NEXT: lsls r0, r0, #31 ; CHECK-NEXT: beq .LBB0_1 @@ -55,7 +55,7 @@ ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl foo -; CHECK-NEXT: str.w r10, [r5, #4] +; CHECK-NEXT: str.w r9, [r5, #4] ; CHECK-NEXT: b .LBB0_1 entry: %cmp4.i.i = icmp ugt i32 %Level, 31 diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -318,9 +318,8 @@ ; CHECK-LABEL: extract_nxv8i8_nxv1i8_7: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu ; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -304,11 +304,10 @@ ; CHECK-LABEL: insert_nxv16i8_nxv1i8_7: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu -; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: sub a1, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v10, a1 ; CHECK-NEXT: ret %v = call @llvm.experimental.vector.insert.nxv1i8.nxv16i8( %vec, %subvec, i64 7) ret %v @@ -318,11 +317,10 @@ ; CHECK-LABEL: insert_nxv16i8_nxv1i8_15: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu -; CHECK-NEXT: vslideup.vx v9, v10, a0 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: sub a1, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vx v9, v10, a1 ; CHECK-NEXT: ret %v = call @llvm.experimental.vector.insert.nxv1i8.nxv16i8( %vec, %subvec, i64 15) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll @@ -36,9 +36,8 @@ ; CHECK-LABEL: load_nxv7f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a2, a2, 3 -; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: sub a2, a3, a2 +; CHECK-NEXT: srli a3, a2, 3 +; CHECK-NEXT: sub a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vse16.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll @@ -22,9 +22,8 @@ ; CHECK-LABEL: store_nxv7f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: sub a1, a2, a1 +; CHECK-NEXT: srli a2, a1, 3 +; CHECK-NEXT: sub a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -123,15 +123,10 @@ define void @fun3(<3 x i31>* %src, <3 x i31>* %p) ; CHECK-LABEL: fun3: ; CHECK: # %bb.0: -; CHECK-NEXT: l %r0, 8(%r2) +; CHECK-NEXT: llgf %r0, 8(%r2) ; CHECK-NEXT: lg %r1, 0(%r2) -; CHECK-NEXT: sllg %r2, %r1, 32 -; CHECK-NEXT: lr %r2, %r0 -; CHECK-NEXT: st %r0, 8(%r3) -; CHECK-NEXT: srlg %r0, %r2, 32 -; CHECK-NEXT: lr %r1, %r0 -; CHECK-NEXT: nihh %r1, 8191 ; CHECK-NEXT: stg %r1, 0(%r3) +; CHECK-NEXT: st %r0, 8(%r3) ; CHECK-NEXT: br %r14 { %tmp = load <3 x i31>, <3 x i31>* %src