diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2429,20 +2429,36 @@
 // COPY is workaround tablegen bug from multiple outputs
 // from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
-  (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i16 16))
 >;

 def : GCNPat <
-  (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))),
+  (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
+  (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1))
+>;
+
+
+def : GCNPat <
+  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
 >;

 def : GCNPat <
-  (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))),
+  (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
+  (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1))
+>;
+
+def : GCNPat <
+  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
 >;

+def : GCNPat <
+  (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+  (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1))
+>;
+
 def : GCNPat <
   (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
   (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
@@ -2459,41 +2475,74 @@
 >;

 def : GCNPat <
-  (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i32 16))
 >;

 def : GCNPat <
-  (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))),
+  (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
+  (v2i16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1))
+>;
+
+
+def : GCNPat <
+  (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i32 16))
 >;

+def : GCNPat <
+  (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
+  (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1))
+>;
+
 let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat <
-  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;

+def : GCNPat <
+  (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
+  (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
+>;
+
 // With multiple uses of the shift, this will duplicate the shift and
 // increase register pressure.
 def : GCNPat <
-  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
   (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1))
 >;

+def : GCNPat <
+  (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
+  (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1))
+>;
+

 def : GCNPat <
-  (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
                        (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
   (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;

+def : GCNPat <
+  (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
+                                         (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
+  (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0))))
+>;
+
 // TODO: Should source modifiers be matched to v_pack_b32_f16?
 def : GCNPat <
-  (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
+  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;

+def : GCNPat <
+  (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
+  (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0))))
+>;
+
+
 def : GCNPat <
   (v2f16 (is_canonicalized (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
                            (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
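Each pattern pair above is gated on the divergence bit that the DAG divergence analysis attaches to the build_vector node, so the scalar (S_*) form is selected only for uniform nodes and the VALU (V_*) form only for divergent ones, rather than fixing the scalar form up later. UniformBinFrag and DivergentBinFrag are PatFrag wrappers over an arbitrary binary operator; a minimal sketch of their shape (assuming the usual AMDGPUInstructions.td definitions, which may differ in detail):

// Sketch: divergence-gated wrappers around a binary SDNode (assumed shape;
// see AMDGPUInstructions.td for the authoritative definitions).
class UniformBinFrag<SDPatternOperator Op> : PatFrag <
  (ops node:$src0, node:$src1),
  (Op $src0, $src1),
  [{ return !N->isDivergent(); }]>;

class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
  (ops node:$src0, node:$src1),
  (Op $src0, $src1),
  [{ return N->isDivergent(); }]>;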
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
new file mode 100755
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -0,0 +1,215 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX9 %s
+
+; GCN-LABEL: name: uniform_vec_0_i16
+; GCN: S_LSHL_B32
+define amdgpu_kernel void @uniform_vec_0_i16(i32 addrspace(1)* %out, i16 %a) {
+  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_0_i16
+; GCN: V_LSHLREV_B32_e64
+define i32 @divergent_vec_0_i16(i16 %a) {
+  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  ret i32 %val
+}
+
+; GCN-LABEL: name: uniform_vec_i16_0
+; GCN: S_AND_B32
+define amdgpu_kernel void @uniform_vec_i16_0(i32 addrspace(1)* %out, i16 %a) {
+  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_i16_0
+; GCN: V_AND_B32_e64
+define i32 @divergent_vec_i16_0(i16 %a) {
+  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  ret i32 %val
+}
+
+; GCN-LABEL: name: uniform_vec_f16_0
+; GCN: S_AND_B32
+define amdgpu_kernel void @uniform_vec_f16_0(float addrspace(1)* %out, half %a) {
+  %tmp = insertelement <2 x half> undef, half %a, i32 0
+  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
+  %val = bitcast <2 x half> %vec to float
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_f16_0
+; GCN: V_CVT_F16_F32_e64 0, %0
+; GCN: COPY %1
+
+; GFX9-LABEL: name: divergent_vec_f16_0
+; GFX9: V_AND_B32_e64
+define float @divergent_vec_f16_0(half %a) {
+  %tmp = insertelement <2 x half> undef, half %a, i32 0
+  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
+  %val = bitcast <2 x half> %vec to float
+  ret float %val
+}
+
+; GCN-LABEL: name: uniform_vec_i16_LL
+; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
+; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
+; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
+; GCN: %[[SHL:[0-9]+]]:sreg_32 = S_LSHL_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
+; GCN: S_OR_B32 killed %[[AND]], killed %[[SHL]]
+
+; GFX9-LABEL: name: uniform_vec_i16_LL
+; GFX9: S_PACK_LL_B32_B16
+define amdgpu_kernel void @uniform_vec_i16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
+  %val0 = load volatile i32, i32 addrspace(4)* %in0
+  %val1 = load volatile i32, i32 addrspace(4)* %in1
+  %lo = trunc i32 %val0 to i16
+  %hi = trunc i32 %val1 to i16
+  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
+  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_i16_LL
+; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
+; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]], %1, implicit $exec
+; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
+; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %0, killed %[[IMM]], implicit $exec
+; GCN: V_OR_B32_e64 killed %[[AND]], killed %[[SHL]], implicit $exec
+
+; GFX9-LABEL: name: divergent_vec_i16_LL
+; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535
+; GFX9: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[IMM]]
+; GFX9: V_LSHL_OR_B32_e64 %{{[0-9]+}}, 16, killed %[[AND]]
+define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
+  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  ret i32 %val
+}
+
+; GCN-LABEL: name: uniform_vec_i16_LH
+; GCN-DAG: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
+; GCN-DAG: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
+; GCN-DAG: %[[NEG:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
+; GCN-DAG: %[[ANDN:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[NEG]]
+; GCN: S_OR_B32 killed %[[AND]], killed %[[ANDN]]
+
+; GFX9-LABEL: name: uniform_vec_i16_LH
+; GFX9: S_PACK_LH_B32_B16
+define amdgpu_kernel void @uniform_vec_i16_LH(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+  %shift = lshr i32 %b, 16
+  %tr = trunc i32 %shift to i16
+  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_i16_LH
+; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
+; GCN: V_BFI_B32_e64 killed %[[IMM]]
+define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
+  %shift = lshr i32 %b, 16
+  %tr = trunc i32 %shift to i16
+  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  ret i32 %val
+}
+
+; GCN-LABEL: name: uniform_vec_i16_HH
+; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
+; GCN: %[[SHR:[0-9]+]]:sreg_32 = S_LSHR_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
+; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
+; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
+; GCN: S_OR_B32 killed %[[SHR]], killed %[[AND]]
+
+; GFX9-LABEL: name: uniform_vec_i16_HH
+; GFX9: S_PACK_HH_B32_B16
+define amdgpu_kernel void @uniform_vec_i16_HH(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %shift_a = lshr i32 %a, 16
+  %tr_a = trunc i32 %shift_a to i16
+  %shift_b = lshr i32 %b, 16
+  %tr_b = trunc i32 %shift_b to i16
+  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_i16_HH
+; GCN: %[[SHR:[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed %{{[0-9]+}}, %0, implicit $exec
+; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
+; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %1, killed %[[IMM]], implicit $exec
+; GCN: V_OR_B32_e64 killed %[[SHR]], killed %[[AND]], implicit $exec
+
+; GFX9-LABEL: name: divergent_vec_i16_HH
+; GFX9: %[[SHR:[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %0
+; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -65536, implicit $exec
+; GFX9: V_AND_OR_B32_e64 %1, killed %[[IMM]], killed %[[SHR]]
+define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
+  %shift_a = lshr i32 %a, 16
+  %tr_a = trunc i32 %shift_a to i16
+  %shift_b = lshr i32 %b, 16
+  %tr_b = trunc i32 %shift_b to i16
+  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
+  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
+  %val = bitcast <2 x i16> %vec to i32
+  ret i32 %val
+}
+
+; GCN-LABEL: name: uniform_vec_f16_LL
+; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
+; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
+; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
+; GCN: %[[SHL:[0-9]+]]:sreg_32 = S_LSHL_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
+; GCN: S_OR_B32 killed %[[AND]], killed %[[SHL]]
+
+; GFX9-LABEL: name: uniform_vec_f16_LL
+; GFX9: S_PACK_LL_B32_B16
+define amdgpu_kernel void @uniform_vec_f16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
+  %val0 = load volatile i32, i32 addrspace(4)* %in0
+  %val1 = load volatile i32, i32 addrspace(4)* %in1
+  %lo.i = trunc i32 %val0 to i16
+  %hi.i = trunc i32 %val1 to i16
+  %lo = bitcast i16 %lo.i to half
+  %hi = bitcast i16 %hi.i to half
+  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+
+  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
+  ret void
+}
+
+; GCN-LABEL: name: divergent_vec_f16_LL
+; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
+; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]]
+; GCN: V_OR_B32_e64 killed %{{[0-9]+}}, killed %[[SHL]], implicit $exec
+
+; GFX9-LABEL: name: divergent_vec_f16_LL
+; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535
+; GFX9: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[IMM]]
+; GFX9: V_LSHL_OR_B32_e64 %{{[0-9]+}}, 16, killed %[[AND]]
+define float @divergent_vec_f16_LL(half %a, half %b) {
+  %tmp = insertelement <2 x half> undef, half %a, i32 0
+  %vec = insertelement <2 x half> %tmp, half %b, i32 1
+  %val = bitcast <2 x half> %vec to float
+  ret float %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2362,46 +2362,46 @@
 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3]
 ; GFX9-NEXT: s_addc_u32 s9, s9, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_and_b32_e32 v11, 15, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4
+; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 4
+; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4
+; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4
+; GFX9-NEXT: v_bfe_u32 v10, v1, 8, 4
+; GFX9-NEXT: v_bfe_u32 v11, v1, 4, 4
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_e32 v18, 15, v2
-; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1
-; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4
-; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT: v_bfe_u32 v10, v1, 12, 4
-; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4
-; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 28, v2
-; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4
-; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT: v_bfe_u32 v17, v2, 12, 4
-; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4
-; GFX9-NEXT: v_and_b32_e32 v11, v4, v11
-; GFX9-NEXT: v_and_b32_e32 v18, v4, v18
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v11
-; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v18
-; GFX9-NEXT: v_and_b32_e32 v9, v4, v9
-; GFX9-NEXT: v_and_b32_e32 v5, v4, v5
-; GFX9-NEXT: v_and_b32_e32 v16, v4, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4
+; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4
+; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4
+; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4
+; GFX9-NEXT: v_bfe_u32 v17, v2, 8, 4
+; GFX9-NEXT: v_bfe_u32 v18, v2, 4, 4
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_and_b32_e32 v1, v4, v1
+; GFX9-NEXT: v_and_b32_e32 v2, v4, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
+; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v10, v4, v10
+; GFX9-NEXT: v_and_b32_e32 v6, v4, v6
+; GFX9-NEXT: v_and_b32_e32 v17, v4, v17
 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
-; GFX9-NEXT: v_lshl_or_b32 v9, v10, 16, v9
-; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5
-; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v16
+; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10
+; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v6
+; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17
 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
-; GFX9-NEXT: v_and_b32_e32 v7, v4, v7
-; GFX9-NEXT: v_and_b32_e32 v14, v4, v14
-; GFX9-NEXT: v_and_b32_e32 v4, v4, v12
+; GFX9-NEXT: v_and_b32_e32 v8, v4, v8
+; GFX9-NEXT: v_and_b32_e32 v15, v4, v15
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v13
 ; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7
-; GFX9-NEXT: v_lshl_or_b32 v8, v15, 16, v14
-; GFX9-NEXT: v_lshl_or_b32 v4, v13, 16, v4
+; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8
+; GFX9-NEXT: v_lshl_or_b32 v8, v14, 16, v15
+; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4
 ; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -2445,46 +2445,46 @@
 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3]
 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 20, 4
+; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4
+; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4
+; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 8, 4
+; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 4, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2
-; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
-; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4
-; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 28, v2
-; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4
-; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 12, 4
-; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4
-; GFX9-DL-NEXT: v_and_b32_e32 v11, v4, v11
-; GFX9-DL-NEXT: v_and_b32_e32 v18, v4, v18
-; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11
-; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v18
-; GFX9-DL-NEXT: v_and_b32_e32 v9, v4, v9
-; GFX9-DL-NEXT: v_and_b32_e32 v5, v4, v5
-; GFX9-DL-NEXT: v_and_b32_e32 v16, v4, v16
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4
+; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
+; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
+; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4
+; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 8, 4
+; GFX9-DL-NEXT: v_bfe_u32 v18, v2, 4, 4
+; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2
+; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1
+; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10
+; GFX9-DL-NEXT: v_and_b32_e32 v6, v4, v6
+; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17
 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: v_lshl_or_b32 v9, v10, 16, v9
-; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5
-; GFX9-DL-NEXT: v_lshl_or_b32 v6, v17, 16, v16
+; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10
+; GFX9-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
+; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17
 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: v_and_b32_e32 v7, v4, v7
-; GFX9-DL-NEXT: v_and_b32_e32 v14, v4, v14
-; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v12
+; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8
+; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15
+; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v13
 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7
-; GFX9-DL-NEXT: v_lshl_or_b32 v8, v15, 16, v14
-; GFX9-DL-NEXT: v_lshl_or_b32 v4, v13, 16, v4
+; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
+; GFX9-DL-NEXT: v_lshl_or_b32 v8, v14, 16, v15
+; GFX9-DL-NEXT: v_lshl_or_b32 v4, v12, 16, v4
 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2529,57 +2529,57 @@
 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 15, v1
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 24, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 20, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 16, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 12, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 8, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v11, v1, 4, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 15, v1
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 20, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 12, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v1, v1, 4, 4
 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, v4, v11
+; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v2
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v14, v2, 24, 4
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1
 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 8, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4
-; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 20, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 16, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 12, 4
+; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 8, 4
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1
 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 20, 4
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 12, 4
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, v4, v9
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v18
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v2, 16, v13
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v18, 16, v2
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v15
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v17
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5
-; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v12
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v6
+; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v14
 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
-; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
-; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3
+; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4
 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2
 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1]
@@ -2617,57 +2617,57 @@
 ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
 ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 15, v1
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v6, v1, 24, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 20, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 16, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 12, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 8, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v11, v1, 4, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 15, v1
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 24, 4
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 16, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 20, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 8, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 12, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v1, v1, 4, 4
 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, v4, v11
+; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v0
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v14, v0, 24, 4
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1
 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 8, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4
-; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 20, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 16, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 12, 4
+; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 8, 4
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1
 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 20, 4
-; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 12, 4
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, v4, v9
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v18
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v0, 16, v13
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v18, 16, v0
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v15
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v17
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5
-; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v12
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v6
+; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v14
 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
-; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
-; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3
+; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4
 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -179,13 +179,13 @@
 ; GFX9-LABEL: shuffle_v4f16_35u5:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[2:3], off
 ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4
 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -194,13 +194,13 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[2:3], off
 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -214,13 +214,13 @@
 ; GFX9-LABEL: shuffle_v4f16_357u:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -229,13 +229,13 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4
 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -803,8 +803,8 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5
 ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5
 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -817,11 +817,11 @@
 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
 ; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v5
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -985,13 +985,13 @@
 ; GFX9-LABEL: shuffle_v4f16_6161:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -999,13 +999,13 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
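For the divergent LL packs checked above, v_lshl_or_b32 computes (src0 << src1) | src2, so the selected VALU sequence is (hi << 16) | (lo & 0xffff). A minimal LLVM IR sketch of that arithmetic (the function name is illustrative only, not part of the patch):

define i32 @pack_v2i16_sketch(i16 %lo, i16 %hi) {
  %lo32 = zext i16 %lo to i32   ; lo & 0xffff
  %hi32 = zext i16 %hi to i32
  %shl = shl i32 %hi32, 16      ; hi << 16
  %pack = or i32 %shl, %lo32    ; (hi << 16) | (lo & 0xffff)
  ret i32 %pack
}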