diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19009,16 +19009,10 @@ APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits()); - // See if we can simplify the input to this truncstore with knowledge that - // only the low bits are being used. For example: - // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" AddToWorklist(Value.getNode()); - if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits)) - return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), - ST->getMemOperand()); - // Otherwise, see if we can simplify the operation with - // SimplifyDemandedBits, which only works if the value has a single use. + // See if we can simplify the operation with SimplifyDemandedBits, which + // only works if the value has a single use. if (SimplifyDemandedBits(Value, TruncDemandedBits)) { // Re-visit the store if anything changed and the store hasn't been merged // with another node (N is deleted) SimplifyDemandedBits will add Value's @@ -19028,6 +19022,13 @@ AddToWorklist(N); return SDValue(N, 0); } + + // Otherwise, see if we can simplify the input to this truncstore with + // knowledge that only the low bits are being used. For example: + // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" + if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits)) + return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), + ST->getMemOperand()); } // If this is a load followed by a store to the same location, then the store diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2478,24 +2478,6 @@ return getConstant(NewVal, SDLoc(V), V.getValueType()); break; } - case ISD::SRL: - // Only look at single-use SRLs. - if (!V.getNode()->hasOneUse()) - break; - if (auto *RHSC = dyn_cast(V.getOperand(1))) { - // See if we can recursively simplify the LHS. - unsigned Amt = RHSC->getZExtValue(); - - // Watch out for shift count overflow though. - if (Amt >= DemandedBits.getBitWidth()) - break; - APInt SrcDemandedBits = DemandedBits << Amt; - if (SDValue SimplifyLHS = TLI->SimplifyMultipleUseDemandedBits( - V.getOperand(0), SrcDemandedBits, *this)) - return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS, - V.getOperand(1)); - } - break; } return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1792,6 +1792,16 @@ Known.One.lshrInPlace(ShAmt); // High bits known zero. Known.Zero.setHighBits(ShAmt); + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); + if (DemandedOp0) { + SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, DemandedOp0, Op1); + return TLO.CombineTo(Op, NewOp); + } + } } break; } diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll --- a/llvm/test/CodeGen/AArch64/parity.ll +++ b/llvm/test/CodeGen/AArch64/parity.ll @@ -47,8 +47,8 @@ ; CHECK-LABEL: parity_17: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0x1ffff -; CHECK-NEXT: eor w8, w8, w8, lsr #16 -; CHECK-NEXT: eor w8, w8, w8, lsr #8 +; CHECK-NEXT: eor w9, w8, w8, lsr #16 +; CHECK-NEXT: eor w8, w9, w8, lsr #8 ; CHECK-NEXT: eor w8, w8, w8, lsr #4 ; CHECK-NEXT: eor w8, w8, w8, lsr #2 ; CHECK-NEXT: eor w8, w8, w8, lsr #1 diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll --- a/llvm/test/CodeGen/AArch64/shift-accumulate.ll +++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll @@ -92,8 +92,8 @@ define <1 x i64> @ssra_v1i64(<2 x i32> %0) { ; CHECK-LABEL: ssra_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bic v0.2s, #64, lsl #24 ; CHECK-NEXT: ushr d1, d0, #63 +; CHECK-NEXT: bic v0.2s, #64, lsl #24 ; CHECK-NEXT: ssra d1, d0, #62 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret @@ -108,8 +108,8 @@ define <2 x i64> @ssra_v2i64(<4 x i32> %0) { ; CHECK-LABEL: ssra_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bic v0.4s, #64, lsl #24 ; CHECK-NEXT: ushr v1.2d, v0.2d, #63 +; CHECK-NEXT: bic v0.4s, #64, lsl #24 ; CHECK-NEXT: ssra v1.2d, v0.2d, #62 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -463,10 +463,10 @@ ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v2i16: @@ -530,12 +530,12 @@ ; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -209,27 +209,27 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 -; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3 -; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5 +; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 +; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 +; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 +; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 ; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 -; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align1: @@ -294,19 +294,19 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:2 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align2: @@ -399,12 +399,14 @@ ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 @@ -495,16 +497,16 @@ ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:8 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:10 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 ; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -12,9 +12,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -29,10 +29,9 @@ ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB0_3 ; SI-NEXT: s_branch .LBB0_4 @@ -48,9 +47,9 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -63,10 +62,9 @@ ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: .LBB0_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 @@ -83,11 +81,10 @@ ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16: @@ -161,28 +158,26 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: s_branch .LBB1_4 ; SI-NEXT: .LBB1_2: -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB1_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -197,25 +192,23 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v1 ; SI-NEXT: .LBB1_4: ; %exit -; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 @@ -224,14 +217,14 @@ ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16_2: @@ -314,19 +307,18 @@ ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: s_branch .LBB2_4 ; SI-NEXT: .LBB2_2: -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB2_3: ; %T @@ -355,12 +347,11 @@ ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: .LBB2_4: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 @@ -444,9 +435,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -477,10 +468,9 @@ ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB3_3 ; SI-NEXT: s_branch .LBB3_4 @@ -496,9 +486,9 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -527,10 +517,9 @@ ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: .LBB3_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 @@ -547,11 +536,10 @@ ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_4xi16: @@ -641,13 +629,13 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -665,20 +653,18 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB4_3 ; SI-NEXT: s_branch .LBB4_4 ; SI-NEXT: .LBB4_2: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB4_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -695,9 +681,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -718,16 +704,14 @@ ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: .LBB4_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 @@ -743,7 +727,7 @@ ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_4xi16_2: @@ -858,19 +842,18 @@ ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB5_3 ; SI-NEXT: s_branch .LBB5_4 ; SI-NEXT: .LBB5_2: -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB5_3: ; %T @@ -915,12 +898,11 @@ ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: .LBB5_4: ; %exit -; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -535,7 +535,7 @@ ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfe_u32 v1, v0, 16, 15 -; CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; CI-NEXT: flat_store_short v[0:1], v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v1 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -802,14 +802,14 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_or_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; SI-NEXT: v_or_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i16: @@ -1021,17 +1021,17 @@ ; SI-NEXT: v_or_b32_e32 v4, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 -; SI-NEXT: v_or_b32_e32 v4, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -963,34 +963,30 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2 +; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8 ; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX7-NEXT: v_ashrrev_i32_e32 v8, 24, v0 +; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_alignbit_b32 v2, 0, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 -; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1851,27 +1851,23 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2143,18 +2139,16 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX9-NODL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2173,18 +2167,16 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX9-DL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2203,24 +2195,23 @@ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DL-NEXT: v_lshrrev_b16 v8, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX10-DL-NEXT: v_lshrrev_b16 v9, 8, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 -; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_mul_lo_u16 v5, v7, v8 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 ; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4 -; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 -; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1 +; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2204,60 +2204,48 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 4, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 -; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 ; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2844,84 +2832,50 @@ ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2 -; GFX7-NEXT: v_bfe_i32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v7, v2, 12, 4 -; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 -; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4 -; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4 -; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12 -; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v15 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v16 +; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 +; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 +; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 -; GFX7-NEXT: v_or_b32_e32 v8, v11, v10 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v13 -; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v2 -; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4 -; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 -; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8 -; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v0, v11, v16, v0 -; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v6, v6, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 +; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24 +; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -3051,6 +3005,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 ; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 @@ -3058,63 +3014,60 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 +; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 +; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 ; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v1, v7, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v9 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -3144,6 +3097,8 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 @@ -3151,63 +3106,60 @@ ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v1, v7, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v9 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -3234,68 +3186,67 @@ ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3334,78 +3285,77 @@ ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v7, v14 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 8, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v12, v5, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v9 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v11, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2121,42 +2121,30 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 12, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v7, 15, v2 -; GFX7-NEXT: v_alignbit_b32 v2, v8, v2, 16 -; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v0 -; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9 -; GFX7-NEXT: v_or_b32_e32 v8, v14, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v8, v1 -; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 20, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 12, 4 -; GFX7-NEXT: v_alignbit_b32 v0, v15, v0, 16 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v16, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -2478,70 +2466,40 @@ ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2 -; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2 +; GFX7-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX7-NEXT: v_bfe_u32 v6, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v8 -; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 -; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 -; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v11 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v15 -; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xf0f, v0 -; GFX7-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX7-NEXT: v_and_b32_e32 v2, 0xf0f, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v14, v4 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 15, v3 -; GFX7-NEXT: v_and_b32_e32 v13, 15, v4 -; GFX7-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX7-NEXT: v_bfe_u32 v14, v4, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 +; GFX7-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2 +; GFX7-NEXT: v_bfe_u32 v13, v0, 4, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 12, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v9, 15, v2 -; GFX7-NEXT: v_and_b32_e32 v15, 15, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v11, v1 -; GFX7-NEXT: v_bfe_u32 v10, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v16, v0, 8, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v10, v16, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 +; GFX7-NEXT: v_alignbit_b32 v2, v9, v2, 24 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v15 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_alignbit_b32 v8, 0, v8, 24 +; GFX7-NEXT: v_alignbit_b32 v7, 0, v9, 24 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v11, v0, 16, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 28, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v7, v1 +; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4 +; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 24 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v15, v9, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2635,52 +2593,51 @@ ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 ; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1 +; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4 ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4 ; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 ; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4 +; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2 ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v8, v17, v8 ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 ; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v1, v18, v10 +; GFX9-NEXT: v_or_b32_e32 v0, v18, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX9-NEXT: v_or_b32_e32 v7, v12, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v10, v12, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v1, v5, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v7 +; GFX9-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v4, v1 ; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 +; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -2701,52 +2658,51 @@ ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v8, v17, v8 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v10 +; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v10 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v0 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v8 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v10, v12, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v1, v5, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7 +; GFX9-DL-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2770,53 +2726,52 @@ ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v10 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v13 +; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 +; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1 ; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v2, v8, v14 -; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-DL-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v11, v13 -; GFX10-DL-NEXT: v_mul_lo_u16 v7, v9, v15 -; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v0 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 +; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6 +; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14 +; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0 +; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10 -; GFX10-DL-NEXT: v_mul_lo_u16 v10, v12, v16 -; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v8 -; GFX10-DL-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7 +; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16 +; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v12, v16, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v12, v7, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7 -; GFX10-DL-NEXT: v_mad_u16 v0, v9, v15, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -140,10 +140,10 @@ ; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 ; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -105,13 +105,12 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v2, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v0, v3 +; SI-NEXT: v_or_b32_e32 v0, v2, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -122,12 +121,10 @@ ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0 -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -146,13 +143,12 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_or_b32_e32 v2, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v0, v3 +; SI-NEXT: v_or_b32_e32 v0, v2, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -83,11 +83,11 @@ ; GCN-LABEL: v_shl_i128_vk: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_alignbit_b32 v4, v2, v1, 15 +; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 17 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 15, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 15 -; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 15 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GCN-NEXT: v_mov_b32_e32 v2, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 %lhs, 17 ret i128 %shl @@ -110,11 +110,11 @@ ; GCN-LABEL: v_ashr_i128_vk: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashr_i64 v[4:5], v[2:3], 33 -; GCN-NEXT: v_alignbit_b32 v0, v2, v1, 1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 1 -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_lshl_b64 v[0:1], v[2:3], 31 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 %lhs, 33 ret i128 %shl diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -140,10 +140,10 @@ ; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 ; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -144,7 +144,7 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: trunc_v2i64_arg_to_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -116,9 +116,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -213,17 +213,17 @@ ; GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -90,16 +90,16 @@ ; ; BE-LABEL: i56_or: ; BE: @ %bb.0: -; BE-NEXT: ldr r1, [r0] -; BE-NEXT: strb r1, [r0, #3] -; BE-NEXT: ldrh r2, [r0, #4]! -; BE-NEXT: ldrb r3, [r0, #2] +; BE-NEXT: mov r1, r0 +; BE-NEXT: ldr r0, [r0] +; BE-NEXT: ldrh r2, [r1, #4]! +; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r1, r2, r1, lsl #24 -; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: orr r0, r2, r0, lsl #24 +; BE-NEXT: orr r0, r0, #384 +; BE-NEXT: strb r0, [r1, #2] +; BE-NEXT: lsr r0, r0, #8 +; BE-NEXT: strh r0, [r1] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -118,16 +118,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ %bb.0: -; BE-NEXT: ldr r1, [r0] +; BE-NEXT: ldrh r1, [r0, #4]! ; BE-NEXT: mov r2, #128 -; BE-NEXT: strb r1, [r0, #3] -; BE-NEXT: ldrh r12, [r0, #4]! -; BE-NEXT: ldrb r3, [r0, #2] +; BE-NEXT: orr r1, r1, #1 ; BE-NEXT: strb r2, [r0, #2] -; BE-NEXT: orr r2, r3, r12, lsl #8 -; BE-NEXT: orr r1, r2, r1, lsl #24 -; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: lsr r1, r1, #8 ; BE-NEXT: strh r1, [r0] ; BE-NEXT: mov pc, lr @@ -149,13 +143,10 @@ ; ; BE-LABEL: i56_insert_bit: ; BE: @ %bb.0: -; BE-NEXT: ldr r2, [r0] -; BE-NEXT: strb r2, [r0, #3] -; BE-NEXT: ldrh r12, [r0, #4]! -; BE-NEXT: ldrb r3, [r0, #2] -; BE-NEXT: orr r3, r3, r12, lsl #8 -; BE-NEXT: orr r2, r3, r2, lsl #24 -; BE-NEXT: bic r2, r2, #8192 +; BE-NEXT: ldrh r2, [r0, #4]! +; BE-NEXT: mov r3, #57088 +; BE-NEXT: orr r3, r3, #16711680 +; BE-NEXT: and r2, r3, r2, lsl #8 ; BE-NEXT: orr r1, r2, r1, lsl #13 ; BE-NEXT: lsr r1, r1, #8 ; BE-NEXT: strh r1, [r0] diff --git a/llvm/test/CodeGen/ARM/parity.ll b/llvm/test/CodeGen/ARM/parity.ll --- a/llvm/test/CodeGen/ARM/parity.ll +++ b/llvm/test/CodeGen/ARM/parity.ll @@ -47,8 +47,8 @@ ; CHECK-LABEL: parity_17: ; CHECK: @ %bb.0: ; CHECK-NEXT: bfc r0, #17, #15 -; CHECK-NEXT: eor r0, r0, r0, lsr #16 -; CHECK-NEXT: eor r0, r0, r0, lsr #8 +; CHECK-NEXT: eor r1, r0, r0, lsr #16 +; CHECK-NEXT: eor r0, r1, r0, lsr #8 ; CHECK-NEXT: eor r0, r0, r0, lsr #4 ; CHECK-NEXT: eor r0, r0, r0, lsr #2 ; CHECK-NEXT: eor r0, r0, r0, lsr #1 diff --git a/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll b/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll --- a/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll +++ b/llvm/test/CodeGen/PowerPC/fp-to-int-to-fp.ll @@ -84,35 +84,35 @@ ; PPC64-NEXT: addi 3, 5, 0 ; PPC64-NEXT: .LBB2_2: # %entry ; PPC64-NEXT: sradi 4, 3, 53 -; PPC64-NEXT: clrldi 5, 3, 63 +; PPC64-NEXT: rldicl 5, 3, 63, 1 ; PPC64-NEXT: addi 4, 4, 1 +; PPC64-NEXT: clrldi 6, 3, 63 ; PPC64-NEXT: cmpldi 4, 1 -; PPC64-NEXT: rldicl 4, 3, 63, 1 -; PPC64-NEXT: or 5, 5, 4 -; PPC64-NEXT: rldicl 6, 5, 11, 53 -; PPC64-NEXT: addi 6, 6, 1 -; PPC64-NEXT: clrldi 7, 5, 53 -; PPC64-NEXT: cmpldi 1, 6, 1 -; PPC64-NEXT: clrldi 6, 3, 53 +; PPC64-NEXT: clrldi 4, 3, 53 +; PPC64-NEXT: or 6, 6, 5 +; PPC64-NEXT: clrldi 7, 6, 53 +; PPC64-NEXT: addi 4, 4, 2047 ; PPC64-NEXT: addi 7, 7, 2047 -; PPC64-NEXT: addi 6, 6, 2047 -; PPC64-NEXT: or 4, 7, 4 -; PPC64-NEXT: or 6, 6, 3 -; PPC64-NEXT: rldicl 4, 4, 53, 11 -; PPC64-NEXT: rldicr 6, 6, 0, 52 +; PPC64-NEXT: or 4, 4, 3 +; PPC64-NEXT: or 5, 7, 5 +; PPC64-NEXT: rldicl 7, 3, 10, 54 +; PPC64-NEXT: rldicr 4, 4, 0, 52 +; PPC64-NEXT: addi 7, 7, 1 ; PPC64-NEXT: bc 12, 1, .LBB2_4 ; PPC64-NEXT: # %bb.3: # %entry -; PPC64-NEXT: ori 6, 3, 0 +; PPC64-NEXT: ori 4, 3, 0 ; PPC64-NEXT: b .LBB2_4 ; PPC64-NEXT: .LBB2_4: # %entry -; PPC64-NEXT: rldicl 4, 4, 11, 1 -; PPC64-NEXT: cmpdi 3, 0 -; PPC64-NEXT: std 6, -32(1) -; PPC64-NEXT: bc 12, 5, .LBB2_6 +; PPC64-NEXT: rldicl 5, 5, 53, 11 +; PPC64-NEXT: std 4, -32(1) +; PPC64-NEXT: rldicl 4, 5, 11, 1 +; PPC64-NEXT: cmpldi 7, 1 +; PPC64-NEXT: bc 12, 1, .LBB2_6 ; PPC64-NEXT: # %bb.5: # %entry -; PPC64-NEXT: ori 4, 5, 0 +; PPC64-NEXT: ori 4, 6, 0 ; PPC64-NEXT: b .LBB2_6 ; PPC64-NEXT: .LBB2_6: # %entry +; PPC64-NEXT: cmpdi 3, 0 ; PPC64-NEXT: std 4, -24(1) ; PPC64-NEXT: bc 12, 0, .LBB2_8 ; PPC64-NEXT: # %bb.7: # %entry diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll --- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll @@ -85,7 +85,7 @@ ; ; RV64I-LABEL: test_bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -491,7 +491,7 @@ ; ; RV64I-LABEL: test_bitreverse_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -218,7 +218,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -285,7 +285,7 @@ ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: lui a1, 209715 ; RV64M-NEXT: addiw a1, a1, 819 ; RV64M-NEXT: and a2, a0, a1 @@ -683,7 +683,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -739,7 +739,7 @@ ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: lui a1, 209715 ; RV64M-NEXT: addiw a1, a1, 819 ; RV64M-NEXT: and a2, a0, a1 @@ -1214,7 +1214,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -1297,7 +1297,7 @@ ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: lui a1, 209715 ; RV64M-NEXT: addiw a1, a1, 819 ; RV64M-NEXT: and a2, a0, a1 @@ -1805,7 +1805,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -1877,7 +1877,7 @@ ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: lui a1, 209715 ; RV64M-NEXT: addiw a1, a1, 819 ; RV64M-NEXT: and a2, a0, a1 @@ -2300,7 +2300,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -2350,7 +2350,7 @@ ; RV64M-NEXT: lui a2, 349525 ; RV64M-NEXT: addiw a2, a2, 1365 ; RV64M-NEXT: and a1, a1, a2 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: lui a1, 209715 ; RV64M-NEXT: addiw a1, a1, 819 ; RV64M-NEXT: and a2, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbp-zbkb.ll @@ -355,11 +355,11 @@ ; CHECK-LABEL: roriw_bug: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a0, 31 -; CHECK-NEXT: andi a0, a0, -2 -; CHECK-NEXT: srli a2, a0, 1 -; CHECK-NEXT: or a1, a1, a2 -; CHECK-NEXT: sext.w a1, a1 -; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: andi a2, a0, -2 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: xor a0, a2, a0 ; CHECK-NEXT: ret %a = shl i64 %x, 31 %b = and i64 %x, 18446744073709551614 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -29,7 +29,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -83,7 +83,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -146,7 +146,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -204,7 +204,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -273,7 +273,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -380,7 +380,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -423,7 +423,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -465,7 +465,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -520,7 +520,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -622,7 +622,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -660,7 +660,7 @@ ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: lui a1, 209715 ; RV64I-NEXT: addiw a1, a1, 819 ; RV64I-NEXT: and a2, a0, a1 @@ -1028,7 +1028,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I-LABEL: bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -1055,7 +1055,7 @@ define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-LABEL: bswap_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: addiw a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll --- a/llvm/test/CodeGen/RISCV/rv64zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll @@ -2447,7 +2447,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind { ; RV64I-LABEL: bswap_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -2473,7 +2473,7 @@ define void @bswap_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-LABEL: bswap_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: addiw a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 @@ -2614,7 +2614,7 @@ define signext i32 @bitreverse_i32(i32 signext %a) nounwind { ; RV64I-LABEL: bitreverse_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a1, a0, 8 +; RV64I-NEXT: srli a1, a0, 8 ; RV64I-NEXT: lui a2, 16 ; RV64I-NEXT: addiw a2, a2, -256 ; RV64I-NEXT: and a1, a1, a2 @@ -2661,7 +2661,7 @@ define void @bitreverse_i32_nosext(i32 signext %a, i32* %x) nounwind { ; RV64I-LABEL: bitreverse_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a2, a0, 8 +; RV64I-NEXT: srli a2, a0, 8 ; RV64I-NEXT: lui a3, 16 ; RV64I-NEXT: addiw a3, a3, -256 ; RV64I-NEXT: and a2, a2, a3 @@ -2780,7 +2780,7 @@ ; RV64I-NEXT: slli a2, a0, 24 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srliw a2, a0, 24 -; RV64I-NEXT: srliw a0, a0, 16 +; RV64I-NEXT: srli a0, a0, 16 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: slliw a0, a0, 16 @@ -2801,7 +2801,7 @@ ; RV64I-LABEL: bswap_rotl_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srliw a1, a0, 24 -; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: srli a2, a0, 16 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: slli a2, a0, 8 diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -192,7 +192,7 @@ ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, s0 -; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: and a2, a0, s1 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, s1 diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -76,15 +76,20 @@ ; CHECK-NEXT: stmg %r14, %r15, 112(%r15) ; CHECK-NEXT: .cfi_offset %r14, -48 ; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: vlgvf %r0, %v26, 3 +; CHECK-NEXT: vlgvf %r1, %v26, 3 +; CHECK-NEXT: vlgvf %r0, %v26, 2 +; CHECK-NEXT: stc %r1, 30(%r2) +; CHECK-NEXT: srlk %r3, %r1, 8 +; CHECK-NEXT: risbgn %r1, %r1, 33, 167, 0 ; CHECK-NEXT: vlgvf %r5, %v24, 2 -; CHECK-NEXT: srlk %r1, %r0, 8 +; CHECK-NEXT: rosbg %r1, %r0, 2, 32, 31 +; CHECK-NEXT: sth %r3, 28(%r2) +; CHECK-NEXT: srlg %r1, %r1, 24 ; CHECK-NEXT: vlgvf %r3, %v24, 3 -; CHECK-NEXT: sth %r1, 28(%r2) +; CHECK-NEXT: st %r1, 24(%r2) ; CHECK-NEXT: vlgvf %r1, %v26, 0 ; CHECK-NEXT: risbgn %r14, %r5, 6, 164, 27 ; CHECK-NEXT: sllg %r4, %r3, 60 -; CHECK-NEXT: stc %r0, 30(%r2) ; CHECK-NEXT: rosbg %r14, %r3, 37, 63, 60 ; CHECK-NEXT: sllg %r3, %r14, 8 ; CHECK-NEXT: rosbg %r4, %r1, 4, 34, 29 @@ -98,19 +103,15 @@ ; CHECK-NEXT: rosbg %r5, %r3, 39, 63, 58 ; CHECK-NEXT: sllg %r3, %r5, 8 ; CHECK-NEXT: rosbg %r3, %r4, 56, 63, 8 -; CHECK-NEXT: vlgvf %r4, %v26, 1 ; CHECK-NEXT: stg %r3, 0(%r2) -; CHECK-NEXT: vlgvf %r3, %v26, 2 -; CHECK-NEXT: sllg %r5, %r4, 62 -; CHECK-NEXT: rosbg %r5, %r3, 2, 32, 31 -; CHECK-NEXT: rosbg %r5, %r0, 33, 63, 0 +; CHECK-NEXT: vlgvf %r3, %v26, 1 +; CHECK-NEXT: sllg %r4, %r3, 62 +; CHECK-NEXT: rosbg %r4, %r0, 2, 32, 31 ; CHECK-NEXT: risbgn %r0, %r1, 4, 162, 29 -; CHECK-NEXT: rosbg %r0, %r4, 35, 63, 62 +; CHECK-NEXT: rosbg %r0, %r3, 35, 63, 62 ; CHECK-NEXT: sllg %r0, %r0, 8 -; CHECK-NEXT: rosbg %r0, %r5, 56, 63, 8 +; CHECK-NEXT: rosbg %r0, %r4, 56, 63, 8 ; CHECK-NEXT: stg %r0, 16(%r2) -; CHECK-NEXT: srlg %r0, %r5, 24 -; CHECK-NEXT: st %r0, 24(%r2) ; CHECK-NEXT: lmg %r14, %r15, 112(%r15) ; CHECK-NEXT: br %r14 { @@ -125,8 +126,17 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: llgf %r0, 8(%r2) ; CHECK-NEXT: lg %r1, 0(%r2) +; CHECK-NEXT: sllg %r2, %r1, 32 +; CHECK-NEXT: lr %r2, %r0 +; CHECK-NEXT: risbgn %r2, %r2, 2, 160, 0 +; CHECK-NEXT: lgr %r4, %r2 +; CHECK-NEXT: rosbg %r2, %r1, 0, 1, 32 +; CHECK-NEXT: rosbg %r4, %r0, 33, 63, 0 +; CHECK-NEXT: srlg %r0, %r2, 32 +; CHECK-NEXT: lr %r1, %r0 +; CHECK-NEXT: nihh %r1, 8191 +; CHECK-NEXT: st %r4, 8(%r3) ; CHECK-NEXT: stg %r1, 0(%r3) -; CHECK-NEXT: st %r0, 8(%r3) ; CHECK-NEXT: br %r14 { %tmp = load <3 x i31>, <3 x i31>* %src diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -399,37 +399,36 @@ ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andb $15, %al +; X86-NEXT: andb $8, %al ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: addb %cl, %dl ; X86-NEXT: andb $4, %dl -; X86-NEXT: shlb $3, %cl -; X86-NEXT: andb $8, %cl -; X86-NEXT: orb %dl, %cl -; X86-NEXT: movl %eax, %edx -; X86-NEXT: shrb %dl -; X86-NEXT: andb $2, %dl -; X86-NEXT: orb %cl, %dl +; X86-NEXT: movb %cl, %ah +; X86-NEXT: shlb $3, %ah +; X86-NEXT: andb $8, %ah +; X86-NEXT: orb %dl, %ah +; X86-NEXT: shrb %cl +; X86-NEXT: andb $2, %cl +; X86-NEXT: orb %ah, %cl ; X86-NEXT: shrb $3, %al -; X86-NEXT: orb %dl, %al +; X86-NEXT: orb %cl, %al ; X86-NEXT: retl ; ; X64-LABEL: test_bitreverse_i4: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi), %ecx -; X64-NEXT: leal (,%rdi,8), %edx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andb $15, %al +; X64-NEXT: andb $8, %al +; X64-NEXT: leal (%rdi,%rdi), %ecx ; X64-NEXT: andb $4, %cl +; X64-NEXT: leal (,%rdi,8), %edx ; X64-NEXT: andb $8, %dl ; X64-NEXT: orb %cl, %dl -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrb %cl -; X64-NEXT: andb $2, %cl -; X64-NEXT: orb %dl, %cl +; X64-NEXT: shrb %dil +; X64-NEXT: andb $2, %dil +; X64-NEXT: orb %dil, %dl ; X64-NEXT: shrb $3, %al -; X64-NEXT: orb %cl, %al +; X64-NEXT: orb %dl, %al ; X64-NEXT: retq ; ; X86XOP-LABEL: test_bitreverse_i4: diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll --- a/llvm/test/CodeGen/X86/ctpop-combine.ll +++ b/llvm/test/CodeGen/X86/ctpop-combine.ll @@ -88,16 +88,16 @@ ; ; NO-POPCOUNT-LABEL: test4: ; NO-POPCOUNT: # %bb.0: -; NO-POPCOUNT-NEXT: andb $127, %dil -; NO-POPCOUNT-NEXT: movl %edi, %eax -; NO-POPCOUNT-NEXT: shrb %al -; NO-POPCOUNT-NEXT: andb $21, %al -; NO-POPCOUNT-NEXT: subb %al, %dil ; NO-POPCOUNT-NEXT: movl %edi, %ecx +; NO-POPCOUNT-NEXT: andb $127, %cl +; NO-POPCOUNT-NEXT: shrb %dil +; NO-POPCOUNT-NEXT: andb $21, %dil +; NO-POPCOUNT-NEXT: subb %dil, %cl +; NO-POPCOUNT-NEXT: movl %ecx, %eax +; NO-POPCOUNT-NEXT: andb $51, %al +; NO-POPCOUNT-NEXT: shrb $2, %cl ; NO-POPCOUNT-NEXT: andb $51, %cl -; NO-POPCOUNT-NEXT: shrb $2, %dil -; NO-POPCOUNT-NEXT: andb $51, %dil -; NO-POPCOUNT-NEXT: addb %dil, %cl +; NO-POPCOUNT-NEXT: addb %al, %cl ; NO-POPCOUNT-NEXT: movl %ecx, %eax ; NO-POPCOUNT-NEXT: shrb $4, %al ; NO-POPCOUNT-NEXT: addb %cl, %al diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -41,7 +41,7 @@ ; X86-NEXT: shll $16, %ecx ; X86-NEXT: orl %edx, %ecx ; X86-NEXT: orl $384, %ecx # imm = 0x180 -; X86-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 +; X86-NEXT: andl $-128, %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: retl ; @@ -53,7 +53,7 @@ ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: orl $384, %ecx # imm = 0x180 -; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 +; X64-NEXT: andl $-128, %ecx ; X64-NEXT: movw %cx, (%rdi) ; X64-NEXT: retq %b = load i24, ptr %a, align 1 @@ -121,12 +121,11 @@ ; X64-NEXT: shll $16, %ecx ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: shlq $32, %rcx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: orq $384, %rax # imm = 0x180 -; X64-NEXT: movl %eax, (%rdi) -; X64-NEXT: shrq $32, %rax +; X64-NEXT: movl (%rdi), %edx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: orq $384, %rdx # imm = 0x180 ; X64-NEXT: movw %ax, 4(%rdi) +; X64-NEXT: movl %edx, (%rdi) ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 @@ -191,15 +190,14 @@ ; X64-NEXT: shll $16, %edx ; X64-NEXT: orl %ecx, %edx ; X64-NEXT: shlq $32, %rdx -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: movl (%rdi), %esi +; X64-NEXT: orq %rdx, %rsi ; X64-NEXT: shlq $13, %rax ; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF -; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: andq %rsi, %rdx ; X64-NEXT: orq %rax, %rdx +; X64-NEXT: movw %cx, 4(%rdi) ; X64-NEXT: movl %edx, (%rdi) -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movw %dx, 4(%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, ptr %a, align 1 diff --git a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll --- a/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll +++ b/llvm/test/CodeGen/X86/ins_subreg_coalesce-1.ll @@ -5,8 +5,9 @@ ; CHECK-LABEL: t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movzwl 0, %eax -; CHECK-NEXT: orl $2, %eax -; CHECK-NEXT: movw %ax, 0 +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: orl $2, %ecx +; CHECK-NEXT: movw %cx, 0 ; CHECK-NEXT: shrl $3, %eax ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/load-local-v4i5.ll b/llvm/test/CodeGen/X86/load-local-v4i5.ll --- a/llvm/test/CodeGen/X86/load-local-v4i5.ll +++ b/llvm/test/CodeGen/X86/load-local-v4i5.ll @@ -11,6 +11,9 @@ ; CHECK-NEXT: movb -9(%rsp), %cl ; CHECK-NEXT: movzbl -10(%rsp), %edx ; CHECK-NEXT: movzbl -11(%rsp), %esi +; CHECK-NEXT: movzbl %cl, %edi +; CHECK-NEXT: shrb %cl +; CHECK-NEXT: movb %cl, -2(%rsp) ; CHECK-NEXT: andl $31, %eax ; CHECK-NEXT: andl $31, %esi ; CHECK-NEXT: shll $5, %esi @@ -18,16 +21,12 @@ ; CHECK-NEXT: andl $31, %edx ; CHECK-NEXT: shll $10, %edx ; CHECK-NEXT: orl %esi, %edx -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shll $15, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: movw %cx, -4(%rsp) -; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: movb %cl, -2(%rsp) -; CHECK-NEXT: movb %al, -5(%rsp) -; CHECK-NEXT: cmpb $31, %al +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll $15, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: movw %ax, -4(%rsp) +; CHECK-NEXT: movb %dil, -5(%rsp) +; CHECK-NEXT: cmpb $31, %dil ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %Then ; CHECK-NEXT: int3 diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -517,21 +517,20 @@ ; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: kmovw %k1, %eax -; AVX512F-NEXT: movzbl %al, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl %ecx -; AVX512F-NEXT: andl $-43, %ecx -; AVX512F-NEXT: subl %ecx, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333 -; AVX512F-NEXT: shrl $2, %eax +; AVX512F-NEXT: movzbl %al, %ecx +; AVX512F-NEXT: shrl %eax +; AVX512F-NEXT: andl $85, %eax +; AVX512F-NEXT: subl %eax, %ecx +; AVX512F-NEXT: movl %ecx, %eax ; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333 -; AVX512F-NEXT: addl %ecx, %eax -; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $4, %ecx +; AVX512F-NEXT: shrl $2, %ecx +; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333 ; AVX512F-NEXT: addl %eax, %ecx -; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F -; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $4, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F +; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 ; AVX512F-NEXT: shrl $24, %eax ; AVX512F-NEXT: kshiftrw $8, %k1, %k2 ; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} @@ -571,21 +570,20 @@ ; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovb2m %xmm2, %k1 ; AVX512VLBW-NEXT: kmovd %k1, %eax -; AVX512VLBW-NEXT: movzbl %al, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: shrl %ecx -; AVX512VLBW-NEXT: andl $-43, %ecx -; AVX512VLBW-NEXT: subl %ecx, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333 -; AVX512VLBW-NEXT: shrl $2, %eax +; AVX512VLBW-NEXT: movzbl %al, %ecx +; AVX512VLBW-NEXT: shrl %eax +; AVX512VLBW-NEXT: andl $85, %eax +; AVX512VLBW-NEXT: subl %eax, %ecx +; AVX512VLBW-NEXT: movl %ecx, %eax ; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333 -; AVX512VLBW-NEXT: addl %ecx, %eax -; AVX512VLBW-NEXT: movl %eax, %ecx -; AVX512VLBW-NEXT: shrl $4, %ecx +; AVX512VLBW-NEXT: shrl $2, %ecx +; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333 ; AVX512VLBW-NEXT: addl %eax, %ecx -; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F -; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512VLBW-NEXT: movl %ecx, %eax +; AVX512VLBW-NEXT: shrl $4, %eax +; AVX512VLBW-NEXT: addl %ecx, %eax +; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F +; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 ; AVX512VLBW-NEXT: shrl $24, %eax ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -57,15 +57,15 @@ ; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovll %ecx, %edx ; CHECK-NEXT: pextrw $1, %xmm0, %esi -; CHECK-NEXT: movswl %si, %edi -; CHECK-NEXT: leal (%rdi,%rdi), %eax +; CHECK-NEXT: leal (%rsi,%rsi), %edi +; CHECK-NEXT: movswl %si, %eax ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: shldw $1, %ax, %si -; CHECK-NEXT: sarl $15, %edi -; CHECK-NEXT: cmpl $16384, %edi # imm = 0x4000 +; CHECK-NEXT: shldw $1, %di, %si +; CHECK-NEXT: sarl $16, %eax +; CHECK-NEXT: cmpl $16384, %eax # imm = 0x4000 ; CHECK-NEXT: cmovgel %r8d, %esi -; CHECK-NEXT: cmpl $-16384, %edi # imm = 0xC000 +; CHECK-NEXT: cmpl $-16384, %eax # imm = 0xC000 ; CHECK-NEXT: cmovll %ecx, %esi ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: cwtl