Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -104,6 +104,12 @@ [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]), (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>; +def build_vector_trunc_to_bitcast : GICombineRule< + (defs root:$build_vector_trunc), + (match (wip_match_opcode G_BUILD_VECTOR_TRUNC):$build_vector_trunc, + [{ return PostLegalizerHelper.matchBuildVectorTruncToBitcast(*${build_vector_trunc}); }]), + (apply [{ PostLegalizerHelper.applyBuildVectorTruncToBitcast(*${build_vector_trunc}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -119,7 +125,7 @@ "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq]> { + rcp_sqrt_to_rsq, build_vector_trunc_to_bitcast]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; let AdditionalArguments = []; Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -74,6 +75,9 @@ const CvtF32UByteMatchInfo &MatchInfo); bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg); + + bool 
matchBuildVectorTruncToBitcast(MachineInstr &MI); + void applyBuildVectorTruncToBitcast(MachineInstr &MI); }; bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy( @@ -303,6 +307,38 @@ return TLI->isCanonicalized(Reg, MF); } +// %a:_(s2N) = ... +// %b:_(s2N) = G_IMPLICIT_DEF +// %c:_(<2 x sN>) = G_BUILD_VECTOR_TRUNC %a:_(s2N), %b +// +// %c:_(<2 x sN>) = G_BITCAST %a:_(s2N) +bool AMDGPUPostLegalizerCombinerHelper::matchBuildVectorTruncToBitcast( + MachineInstr &MI) { + + if (MI.getNumOperands() != 3) + return false; + + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (DstTy.getSizeInBits() != SrcTy.getSizeInBits()) + return false; + + MachineInstr *SecondElt = + getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + if (!SecondElt || !isa<GImplicitDef>(SecondElt)) + return false; + + return true; +} + +void AMDGPUPostLegalizerCombinerHelper::applyBuildVectorTruncToBitcast( + MachineInstr &MI) { + B.setInstrAndDebugLoc(MI); + + B.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MI.eraseFromParent(); +} + class AMDGPUPostLegalizerCombinerHelperState { protected: AMDGPUCombinerHelper &Helper; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll @@ -46,24 +46,20 @@ ; GFX9-NEXT: global_load_ushort v10, v[0:1], off ; GFX9-NEXT: global_load_ushort v11, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_and_or_b32 v2, v7, v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_or_b32 v6, v9, v0, s4 +; GFX9-NEXT: v_pk_add_u16 v3, v7, v9 ; GFX9-NEXT: 
s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_or_b32 v1, v10, v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v11, v0, v3 +; GFX9-NEXT: v_and_or_b32 v0, v11, v0, v2 ; GFX9-NEXT: v_pk_add_u16 v0, v1, v0 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 ; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2 -; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4 +; GFX9-NEXT: global_store_short v[4:5], v3, off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %a = load <3 x i16>, <3 x i16> addrspace(1)* %ptra, align 4 @@ -86,13 +82,8 @@ ; GFX9-LABEL: addv3i16arg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, s4 -; GFX9-NEXT: v_and_or_b32 v3, v3, v4, s4 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <3 x i16> %a, %b ret <3 x i16> %add @@ -225,35 +216,31 @@ ; GFX9-NEXT: global_load_ushort v14, v[2:3], off ; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_and_or_b32 v3, v8, v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_and_or_b32 v8, v11, v0, s4 +; GFX9-NEXT: v_pk_add_u16 v7, v8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_and_or_b32 v1, v12, v0, v1 ; GFX9-NEXT: s_waitcnt 
vmcnt(2) ; GFX9-NEXT: v_and_or_b32 v2, v13, v0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_or_b32 v6, v14, v0, v6 +; GFX9-NEXT: v_and_or_b32 v3, v14, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v15, v0, v7 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 -; GFX9-NEXT: v_pk_add_u16 v3, v3, v8 +; GFX9-NEXT: v_and_or_b32 v0, v15, v0, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_add_u16 v0, v2, v0 ; GFX9-NEXT: global_store_short v[4:5], v1, off ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:2 ; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:6 -; GFX9-NEXT: global_store_short v[4:5], v3, off offset:8 +; GFX9-NEXT: global_store_short v[4:5], v7, off offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %a = load <5 x i16>, <5 x i16> addrspace(1)* %ptra, align 4 @@ -279,14 +266,9 @@ ; GFX9-LABEL: addv5i16arg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: v_and_or_b32 v2, v2, v6, s4 -; GFX9-NEXT: v_and_or_b32 v5, v5, v6, s4 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_and_or_b32 v2, v2, v6, s4 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <5 x i16> %a, %b ret <5 x i16> %add @@ -453,23 +435,20 @@ ; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:4 ; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_and_or_b32 v6, v9, v0, s4 ; GFX9-NEXT: s_waitcnt 
vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v11 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_and_or_b32 v10, v13, v0, s4 +; GFX9-NEXT: v_pk_add_u16 v9, v9, v13 ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_and_or_b32 v1, v14, v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -477,14 +456,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_and_or_b32 v3, v16, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_or_b32 v7, v17, v0, v7 +; GFX9-NEXT: v_and_or_b32 v6, v17, v0, v6 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_or_b32 v8, v18, v0, v8 +; GFX9-NEXT: v_and_or_b32 v7, v18, v0, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v19, v0, v9 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v7 -; GFX9-NEXT: v_pk_add_u16 v6, v6, v10 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v8 +; GFX9-NEXT: v_and_or_b32 v0, v19, v0, v8 +; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 +; GFX9-NEXT: v_pk_add_u16 v2, v2, v7 ; GFX9-NEXT: v_pk_add_u16 v0, v3, v0 ; GFX9-NEXT: global_store_short v[4:5], v1, off ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:2 @@ -492,7 +470,7 @@ ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v2, off offset:6 ; GFX9-NEXT: global_store_short v[4:5], v0, off offset:8 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:10 -; GFX9-NEXT: global_store_short v[4:5], v6, off offset:12 +; GFX9-NEXT: global_store_short v[4:5], v9, off offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %a = load <7 x i16>, <7 x i16> addrspace(1)* %ptra, align 4 @@ -521,15 +499,10 @@ ; GFX9-LABEL: addv7i16arg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; 
GFX9-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX9-NEXT: v_and_or_b32 v7, v7, v8, s4 -; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 -; GFX9-NEXT: v_and_or_b32 v3, v3, v8, s4 +; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <7 x i16> %a, %b ret <7 x i16> %add @@ -572,21 +545,16 @@ ; GFX9-LABEL: addv9i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off ; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16 ; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_or_b32 v14, v14, v0, s4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_or_b32 v15, v15, v0, s4 +; GFX9-NEXT: v_pk_add_u16 v0, v10, v6 +; GFX9-NEXT: v_pk_add_u16 v1, v11, v7 +; GFX9-NEXT: v_pk_add_u16 v2, v12, v8 +; GFX9-NEXT: v_pk_add_u16 v3, v13, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, v6, v10 -; GFX9-NEXT: v_pk_add_u16 v1, v7, v11 -; GFX9-NEXT: v_pk_add_u16 v2, v8, v12 -; GFX9-NEXT: v_pk_add_u16 v3, v9, v13 ; GFX9-NEXT: v_pk_add_u16 v6, v14, v15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: global_store_short v[4:5], v6, off offset:16 @@ -621,16 +589,11 @@ ; GFX9-LABEL: addv9i16arg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v10, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: v_and_or_b32 v4, v4, v10, s4 -; GFX9-NEXT: v_and_or_b32 v9, v9, v10, s4 -; GFX9-NEXT: v_pk_add_u16 v4, v4, v9 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v5 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v6 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v7 ; 
GFX9-NEXT: v_pk_add_u16 v3, v3, v8 -; GFX9-NEXT: v_and_or_b32 v4, v4, v10, s4 +; GFX9-NEXT: v_pk_add_u16 v4, v4, v9 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <9 x i16> %a, %b ret <9 x i16> %add @@ -764,30 +727,26 @@ ; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20 ; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:16 ; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:16 -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_pk_add_u16 v0, v6, v10 ; GFX9-NEXT: v_pk_add_u16 v1, v7, v11 -; GFX9-NEXT: v_pk_add_u16 v3, v9, v13 +; GFX9-NEXT: v_pk_add_u16 v2, v8, v12 ; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v14 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GFX9-NEXT: v_pk_add_u16 v2, v8, v12 -; GFX9-NEXT: v_and_or_b32 v8, v15, v6, s4 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_or_b32 v10, v17, v6, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; GFX9-NEXT: v_pk_add_u16 v3, v9, v13 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_or_b32 v7, v18, v6, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v6, v19, v6, v9 +; GFX9-NEXT: v_and_or_b32 v6, v19, v6, v8 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off -; GFX9-NEXT: v_pk_add_u16 v8, v8, v10 +; GFX9-NEXT: v_pk_add_u16 v9, v15, v17 ; GFX9-NEXT: v_pk_add_u16 v0, v7, v6 ; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16 ; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18 -; GFX9-NEXT: global_store_short v[4:5], v8, off offset:20 +; GFX9-NEXT: global_store_short v[4:5], v9, off offset:20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %a = load <11 x i16>, <11 x i16> addrspace(1)* %ptra, align 4 @@ -822,17 +781,12 @@ ; GFX9-LABEL: addv11i16arg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v12, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: 
v_and_or_b32 v5, v5, v12, s4 -; GFX9-NEXT: v_and_or_b32 v11, v11, v12, s4 -; GFX9-NEXT: v_pk_add_u16 v5, v5, v11 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v6 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v7 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v8 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v9 ; GFX9-NEXT: v_pk_add_u16 v4, v4, v10 -; GFX9-NEXT: v_and_or_b32 v5, v5, v12, s4 +; GFX9-NEXT: v_pk_add_u16 v5, v5, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <11 x i16> %a, %b ret <11 x i16> %add Index: llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll @@ -59,7 +59,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_f16 v1, 0, 0 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_max_f16 v1, 1.0, 1.0 +; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul <2 x half> %a, %maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %fmul) @@ -114,7 +119,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp +; GFX10-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, 0 +; GFX10-NEXT: v_pk_min_f16 v0, 1.0, v0 +; GFX10-NEXT: v_pk_max_f16 v0, s4, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %fmul = fmul <2 x half> %a, %minnum = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> , <2 x half> %fmul) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir @@ -248,15 +248,16 @@ ; CHECK-NEXT: %two_splat:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %two_s32(s32), %two_s32(s32) ; CHECK-NEXT: %zero:_(s16) = G_FCONSTANT half 0xH0000 ; CHECK-NEXT: %zero_s32:_(s32) = G_ANYEXT %zero(s16) - ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: %zero_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %zero_s32(s32), %undef(s32) + ; CHECK-NEXT: %zero_undef:_(<2 x s16>) = G_BITCAST %zero_s32(s32) ; CHECK-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 ; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16) - ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32) + ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BITCAST %one_s32(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat + ; CHECK-NEXT: %zero_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %zero_undef ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]] - ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FCANONICALIZE]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef_fcan, [[FCANONICALIZE]] + ; CHECK-NEXT: %one_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %one_undef + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef_fcan, [[FMAXNUM_IEEE]] ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 @@ -299,16 +300,16 @@ ; CHECK-NEXT: %two_splat:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %two_s32(s32), %two_s32(s32) ; CHECK-NEXT: %snan:_(s16) = G_FCONSTANT half 0xH7C01 ; CHECK-NEXT: %snan_s32:_(s32) = G_ANYEXT %snan(s16) - ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: %snan_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
%snan_s32(s32), %undef(s32) + ; CHECK-NEXT: %snan_undef:_(<2 x s16>) = G_BITCAST %snan_s32(s32) ; CHECK-NEXT: %qnan:_(s16) = G_FCONSTANT half 0xH7E01 ; CHECK-NEXT: %qnan_s32:_(s32) = G_ANYEXT %qnan(s16) - ; CHECK-NEXT: %qnan_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %qnan_s32(s32), %undef(s32) + ; CHECK-NEXT: %qnan_undef:_(<2 x s16>) = G_BITCAST %qnan_s32(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat ; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]] ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FCANONICALIZE]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: %qnan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %qnan_undef + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef_fcan, [[FMAXNUM_IEEE]] ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -786,117 +786,69 @@ ; GFX9-LABEL: test_3xhalf_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-NEXT: v_and_or_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_and_or_b32 v3, v3, v6, s4 -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_and_or_b32 v2, v5, v6, s4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_pk_add_f16 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v1, v6, s4 +; GFX9-NEXT: 
v_pk_add_f16 v1, v5, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v6, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v3, v3, v6, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v5, v5, v6, s4 -; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v6, s4 +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v6, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v3, v6, s4 -; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v2, v5, v6, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v6, s4 -; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v6, s4 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v6, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v3, v3, v6, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v5, v5, v6, s4 -; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, 
v6, s4 +; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_3xhalf_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 -; GFX10-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v5, s4 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-NEXT: v_pk_add_f16 v1, v3, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v5, 0xffff, v5, s4 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 -; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, 0xffff, v5, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, 
v1, s4 -; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v3, v1 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v5, 0xffff, v5, s4 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -487,20 +487,18 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i32_3d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 -; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 +; 
GFX9-NEXT: v_and_or_b32 v2, v1, v4, v2 +; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -508,17 +506,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v1, v2 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16 +; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -530,20 +526,18 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i32_cube: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 -; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da +; GFX9-NEXT: v_and_or_b32 v2, v1, v4, v2 +; GFX9-NEXT: 
image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -551,17 +545,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v1, v2 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16 +; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -612,20 +604,18 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice) { ; GFX9-LABEL: atomic_add_i32_2darray: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 -; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da +; GFX9-NEXT: v_and_or_b32 v2, v1, v4, v2 +; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm 
glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -633,17 +623,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v1, v2 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16 +; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -655,20 +643,18 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %fragid) { ; GFX9-LABEL: atomic_add_i32_2dmsaa: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 -; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 +; GFX9-NEXT: v_and_or_b32 v2, v1, v4, v2 +; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: ; return to shader part epilog ; @@ -676,17 +662,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v1, v2 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16 +; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1256,20 +1240,18 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i64_3d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 -; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 +; GFX9-NEXT: v_and_or_b32 v3, v2, v5, v3 +; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ 
-1277,17 +1259,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v2, v3 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16 +; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1299,20 +1279,18 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i64_cube: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 -; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da +; GFX9-NEXT: v_and_or_b32 v3, v2, v5, v3 +; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -1320,17 +1298,15 @@ ; GFX10: ; %bb.0: 
; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v2, v3 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16 +; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1381,20 +1357,18 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice) { ; GFX9-LABEL: atomic_add_i64_2darray: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 -; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da +; GFX9-NEXT: v_and_or_b32 v3, v2, v5, v3 +; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -1402,17 +1376,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v2, v3 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16 +; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1424,20 +1396,18 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %fragid) { ; GFX9-LABEL: atomic_add_i64_2dmsaa: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 -; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 +; GFX9-NEXT: v_and_or_b32 v3, v2, v5, v3 +; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -1445,17 +1415,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, 
v3 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v2, v3 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16 +; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -61,24 +61,22 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: v_and_or_b32 v1, v0, v3, v1 ; 
GFX9-NEXT: s_and_b64 exec, exec, s[14:15] -; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da +; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -88,22 +86,20 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v0, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -117,24 +113,22 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 
s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: v_and_or_b32 v1, v0, v3, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] -; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da +; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -144,22 +138,20 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v0, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -225,24 +217,22 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, 
s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX9-NEXT: v_and_or_b32 v1, v0, v3, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] -; GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -252,22 +242,20 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v0, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, 
exec_lo, s14 -; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -281,24 +269,24 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX9-NEXT: v_and_or_b32 v2, v4, v0, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] -; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: image_gather4_c_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -308,22 +296,20 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 
; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -337,21 +323,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, s12 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -364,19 +348,17 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 
-; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 ; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -393,21 +375,19 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, s12 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -420,19 +400,17 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: 
s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 ; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -449,25 +427,24 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX9-NEXT: v_and_or_b32 v2, v4, v0, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] -; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: image_gather4_b_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -477,23 +454,20 @@ ; GFX10NSA-NEXT: 
s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 ; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -507,23 +481,22 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: 
s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v5, s12 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 +; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v5 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -535,23 +508,20 @@ ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 ; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 -; GFX10NSA-NEXT: v_and_or_b32 v3, 0xffff, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 -; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -562,24 +532,22 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; 
GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 -; GFX9-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: v_and_or_b32 v1, v0, v3, v1 +; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -587,21 +555,19 @@ ; GFX10NSA: ; %bb.0: ; %main_body ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v0, v1 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] 
dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -612,24 +578,24 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_c_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 +; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 -; GFX9-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 +; GFX9-NEXT: v_and_or_b32 v2, v4, v0, v2 +; GFX9-NEXT: image_gather4_c_l v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -637,21 +603,19 @@ ; GFX10NSA: ; %bb.0: ; %main_body ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; 
GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: s_mov_b32 s4, s6 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s6, s8 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 +; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -542,44 +542,22 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: load_1d_v3f16_xyz: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: load_1d_v3f16_xyz: -; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX11-NEXT: s_lshl_b32 s0, s0, 16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: load_1d_v3f16_xyz: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) +; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v } @@ -963,3 +941,6 @@ declare { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 immarg, i32, <8 x i32>, i32 immarg, i32 immarg) #0 attributes #0 = { nounwind readonly } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -6,38 +6,34 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 -; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 +; GFX9-NEXT: v_and_or_b32 v1, v0, v3, v1 +; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: load_3d_v4f32_xyzw: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2 -; GFX10PLUS-NEXT: s_mov_b32 s2, s4 -; GFX10PLUS-NEXT: s_mov_b32 s4, s6 -; GFX10PLUS-NEXT: s_mov_b32 s6, s8 -; GFX10PLUS-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10PLUS-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 -; GFX10PLUS-NEXT: v_and_or_b32 v1, 0xffff, v2, s8 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: v_and_or_b32 v1, 0xffff, v0, v1 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 ; 
GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX10PLUS-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX10PLUS-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -47,90 +43,86 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_and_or_b32 v5, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, v8 -; GFX9-NEXT: v_mov_b32_e32 v4, v9 -; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] 
dmask:0xf unorm a16 tfe +; GFX9-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm a16 tfe ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v5, v4, s[10:11] +; GFX9-NEXT: global_store_dword v7, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v8, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v0, v1 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: v_mov_b32_e32 v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v3, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v9 -; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v9, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 -; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v0, v1 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 -; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %v = 
call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -143,90 +135,86 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_and_or_b32 v5, v0, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mov_b32_e32 v10, v7 +; GFX9-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, v8 -; GFX9-NEXT: v_mov_b32_e32 v4, v9 -; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe lwe +; GFX9-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-NEXT: v_mov_b32_e32 v4, v11 +; GFX9-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm a16 tfe lwe ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v5, v4, s[10:11] +; GFX9-NEXT: 
global_store_dword v7, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v6, v5 -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v8, v5 -; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v0, v1 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: v_mov_b32_e32 v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v3, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v9 -; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: -; GFX11-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v9, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 -; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v0, v1 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 -; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll @@ -4,9 +4,6 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -33,9 +30,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -62,9 +56,6 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -91,9 +82,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 
16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -5,9 +5,6 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -34,15 +31,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to 
shader part epilog @@ -54,9 +48,6 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -83,9 +74,6 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -112,9 +100,6 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-build-vector-trunc.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-build-vector-trunc.mir @@ -0,0 +1,58 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck --check-prefix=GFX9 %s + +--- +name: undef_bitcast +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX9-LABEL: name: undef_bitcast + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[BITCAST]](<2 x s16>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + $vgpr1= COPY %2 +... + +--- +name: elt_0_undef +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX9-LABEL: name: elt_0_undef + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[DEF]](s32), [[COPY]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %1, %0 + $vgpr1= COPY %2 +... + +--- +name: different_dst_and_src_size +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX9-LABEL: name: different_dst_and_src_size + ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s64), [[DEF]](s64) + ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_IMPLICIT_DEF + %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + $vgpr2= COPY %2 +... 
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -13,8 +13,6 @@ ; ; GFX10GISEL-LABEL: sample_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -83,8 +81,6 @@ ; ; GFX10GISEL-LABEL: sample_c_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, s12 ; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -222,8 +218,6 @@ ; ; GFX10GISEL-LABEL: sample_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -262,8 +256,6 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, s12 ; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog