Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -88,6 +88,12 @@
     [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
   (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
 
+def remove_or_and_shl : GICombineRule<
+  (defs root:$rm_bitcast, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_BITCAST):$rm_bitcast,
+    [{ return RegBankHelper.matchCombineOrAndShl(*${rm_bitcast}, ${matchinfo}); }]),
+  (apply [{ Helper.replaceSingleDefInstWithReg(*${rm_bitcast}, ${matchinfo}); }])>;
+
 def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
 
 def remove_fcanonicalize : GICombineRule<
@@ -128,7 +134,8 @@
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
   "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3,
   ptr_add_immed_chain,
-  fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+  fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+  remove_or_and_shl]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
   let StateClass = "AMDGPURegBankCombinerHelperState";
   let AdditionalArguments = [];
Index: llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -73,6 +73,7 @@
   bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg);
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
   void applyClamp(MachineInstr &MI, Register &Reg);
+  bool matchCombineOrAndShl(MachineInstr &MI, Register &NewReg);
 
 private:
   AMDGPU::SIModeRegisterDefaults getMode();
@@ -325,6 +326,50 @@
   MI.eraseFromParent();
 }
 
+/// Combiner that removes the unnecessary expression
+/// G_BITCAST (G_OR ((G_AND X, 65535), (G_SHL Y, 16)))
+/// and uses X or Y instead, depending on which instructions define them.
+/// If X and Y are independent, the combiner does nothing.
+/// If X or Y is G_IMPLICIT_DEF, the result is the other, non-implicit
+/// register. The original expression does not hold the same value as that
+/// register: it only keeps its lower or upper 16 bits and leaves the
+/// remaining bits undefined, so replacing the whole expression with the
+/// full register is legal.
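+///
+/// Illustrative sketch only (the register names below are invented, not
+/// taken from the tests): the G_IMPLICIT_DEF case roughly looks like
+///   %x:_(s32) = G_BITCAST %src(<2 x s16>)
+///   %undef:_(s32) = G_IMPLICIT_DEF
+///   %c16:_(s32) = G_CONSTANT i32 16
+///   %mask:_(s32) = G_CONSTANT i32 65535
+///   %hi:_(s32) = G_SHL %undef, %c16(s32)
+///   %lo:_(s32) = G_AND %x, %mask
+///   %or:_(s32) = G_OR %lo, %hi
+///   %res:_(<2 x s16>) = G_BITCAST %or(s32)
+/// and the combine rewrites all uses of %res to use %src directly.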
+bool AMDGPURegBankCombinerHelper::matchCombineOrAndShl(MachineInstr &MI, + Register &NewReg) { + Register SrcReg = MI.getOperand(0).getReg(); + MachineInstr *ShlSrcMI, *AndSrcMI; + + if (mi_match( + SrcReg, MRI, + m_GBitcast(m_GOr(m_GAnd(m_MInstr(AndSrcMI), m_SpecificICst(0xffff)), + m_GShl(m_MInstr(ShlSrcMI), m_SpecificICst(16)))))) { + if (ShlSrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF && + AndSrcMI->getOpcode() == TargetOpcode::G_BITCAST) { + NewReg = AndSrcMI->getOperand(1).getReg(); + return true; + } else if (AndSrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF && + ShlSrcMI->getOpcode() == TargetOpcode::G_BITCAST) { + NewReg = ShlSrcMI->getOperand(1).getReg(); + return true; + } else if (ShlSrcMI->getOpcode() == TargetOpcode::G_LSHR && + ShlSrcMI->getOperand(1).getReg() == + AndSrcMI->getOperand(0).getReg()) { + MachineInstr *ShlCstArg = + getDefIgnoringCopies(ShlSrcMI->getOperand(2).getReg(), MRI); + + if (ShlCstArg->getOpcode() == TargetOpcode::G_CONSTANT && + ShlCstArg->getOperand(1).isCImm() && + ShlCstArg->getOperand(1).getCImm()->equalsInt(16)) { + NewReg = AndSrcMI->getOperand(1).getReg(); + return true; + } + } + } + + return false; +} + AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { return MF.getInfo()->getMode(); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -786,229 +786,69 @@ ; GFX9-LABEL: test_3xhalf_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v3, v4, v9, v3 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_pk_add_f16 v1, v4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v6 -; 
GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v5, v5, v9, s4 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-DENORM-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v4, v9, v3 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v4, v1 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v6 -; GFX9-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v5, v5, v9, s4 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_3xhalf_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: s_lshl_b32 
s4, s4, 16 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-NEXT: v_and_or_b32 v2, v2, v8, v7 -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-NEXT: v_and_or_b32 v2, v5, v8, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: v_pk_add_f16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v8 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v8, v7 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 
v0, v8, v6 -; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v8, s4 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v8 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-and-shl.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-and-shl.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_vs <3 x half> @test_v3f16(<3 x half> %x, <3 x half> %y) { +; GFX9-LABEL: test_v3f16: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_v3f16: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v3 +; GFX10-NEXT: ; return to shader part epilog +.entry: + %a = fadd <3 x half> %x, %y + ret <3 x half> %a +} + +define amdgpu_vs <9 x half> @test_v9f16(<9 x half> %x, <9 x half> %y) { +; GFX9-LABEL: test_v9f16: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_pk_add_f16 v0, v0, v5 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v6 +; GFX9-NEXT: v_pk_add_f16 v2, v2, v7 +; GFX9-NEXT: v_pk_add_f16 v3, v3, v8 +; GFX9-NEXT: v_pk_add_f16 v4, v4, v9 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_v9f16: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_pk_add_f16 v0, v0, v5 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v6 +; GFX10-NEXT: v_pk_add_f16 v2, v2, v7 +; GFX10-NEXT: v_pk_add_f16 v3, v3, v8 +; GFX10-NEXT: v_pk_add_f16 v4, v4, v9 +; GFX10-NEXT: ; return to shader part epilog +.entry: + %a = fadd <9 x half> %x, %y + 
ret <9 x half> %a +} + +define amdgpu_vs <11 x half> @test_v11f16(<11 x half> %x, <11 x half> %y) { +; GFX9-LABEL: test_v11f16: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_pk_add_f16 v0, v0, v6 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v7 +; GFX9-NEXT: v_pk_add_f16 v2, v2, v8 +; GFX9-NEXT: v_pk_add_f16 v3, v3, v9 +; GFX9-NEXT: v_pk_add_f16 v4, v4, v10 +; GFX9-NEXT: v_pk_add_f16 v5, v5, v11 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_v11f16: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_pk_add_f16 v0, v0, v6 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v7 +; GFX10-NEXT: v_pk_add_f16 v2, v2, v8 +; GFX10-NEXT: v_pk_add_f16 v3, v3, v9 +; GFX10-NEXT: v_pk_add_f16 v4, v4, v10 +; GFX10-NEXT: v_pk_add_f16 v5, v5, v11 +; GFX10-NEXT: ; return to shader part epilog +.entry: + %a = fadd <11 x half> %x, %y + ret <11 x half> %a +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-and-shl.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-and-shl.mir @@ -0,0 +1,568 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s + +--- +name: test_v3f16 +alignment: 1 +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; GFX9-LABEL: name: test_v3f16 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[FADD:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY]], [[COPY2]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY1]], [[COPY3]] + ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[FADD1]](<2 x s16>) + ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; GFX10-LABEL: name: test_v3f16 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY]], [[COPY2]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY1]], [[COPY3]] + ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[FADD1]](<2 x s16>) + ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %2:vgpr(<2 x s16>) = COPY $vgpr0 + %3:vgpr(<2 x s16>) = COPY $vgpr1 + %44:vgpr(s32) = G_BITCAST %2(<2 x s16>) + %45:sgpr(s32) = G_CONSTANT i32 16 + %67:vgpr(s32) = COPY %45(s32) + %46:vgpr(s32) = G_LSHR %44, %67(s32) + %47:vgpr(s32) = G_BITCAST %3(<2 x s16>) + %9:vgpr(<2 x s16>) = COPY $vgpr2 + %10:vgpr(<2 x s16>) = COPY $vgpr3 + %49:vgpr(s32) = G_BITCAST %9(<2 x s16>) + %68:vgpr(s32) = COPY %45(s32) + %50:vgpr(s32) = G_LSHR %49, %68(s32) + %51:vgpr(s32) = G_BITCAST %10(<2 x s16>) + %69:vgpr(s32) = G_CONSTANT i32 65535 + 
%70:vgpr(s32) = G_CONSTANT i32 16 + %71:vgpr(s32) = G_SHL %46, %70(s32) + %72:vgpr(s32) = G_AND %44, %69 + %73:vgpr(s32) = G_OR %72, %71 + %38:vgpr(<2 x s16>) = G_BITCAST %73(s32) + %56:sgpr(s32) = G_IMPLICIT_DEF + %74:vgpr(s32) = G_CONSTANT i32 65535 + %75:sgpr(s32) = G_CONSTANT i32 16 + %76:sgpr(s32) = G_SHL %56, %75(s32) + %77:vgpr(s32) = G_AND %47, %74 + %78:vgpr(s32) = G_OR %77, %76 + %39:vgpr(<2 x s16>) = G_BITCAST %78(s32) + %79:vgpr(s32) = G_CONSTANT i32 65535 + %80:vgpr(s32) = G_CONSTANT i32 16 + %81:vgpr(s32) = G_SHL %50, %80(s32) + %82:vgpr(s32) = G_AND %49, %79 + %83:vgpr(s32) = G_OR %82, %81 + %40:vgpr(<2 x s16>) = G_BITCAST %83(s32) + %84:vgpr(s32) = G_CONSTANT i32 65535 + %85:sgpr(s32) = G_CONSTANT i32 16 + %86:sgpr(s32) = G_SHL %56, %85(s32) + %87:vgpr(s32) = G_AND %51, %84 + %88:vgpr(s32) = G_OR %87, %86 + %41:vgpr(<2 x s16>) = G_BITCAST %88(s32) + %42:vgpr(<2 x s16>) = G_FADD %38, %40 + %43:vgpr(<2 x s16>) = G_FADD %39, %41 + %57:vgpr(s32) = G_BITCAST %42(<2 x s16>) + %89:vgpr(s32) = COPY %45(s32) + %58:vgpr(s32) = G_LSHR %57, %89(s32) + %59:vgpr(s32) = G_BITCAST %43(<2 x s16>) + %90:vgpr(s32) = G_CONSTANT i32 65535 + %91:vgpr(s32) = G_CONSTANT i32 16 + %92:vgpr(s32) = G_SHL %58, %91(s32) + %93:vgpr(s32) = G_AND %57, %90 + %94:vgpr(s32) = G_OR %93, %92 + %18:vgpr(<2 x s16>) = G_BITCAST %94(s32) + %95:vgpr(s32) = G_CONSTANT i32 65535 + %96:sgpr(s32) = G_CONSTANT i32 16 + %97:sgpr(s32) = G_SHL %56, %96(s32) + %98:vgpr(s32) = G_AND %59, %95 + %99:vgpr(s32) = G_OR %98, %97 + %19:vgpr(<2 x s16>) = G_BITCAST %99(s32) + $vgpr0 = COPY %18(<2 x s16>) + $vgpr1 = COPY %19(<2 x s16>) + SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + +... +--- +name: test_v9f16 +alignment: 1 +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + + ; GFX9-LABEL: name: test_v9f16 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr9 + ; GFX9-NEXT: [[FADD:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY]], [[COPY5]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY1]], [[COPY6]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY2]], [[COPY7]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY3]], [[COPY8]] + ; GFX9-NEXT: [[FADD4:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY4]], [[COPY9]] + ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[FADD1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[FADD2]](<2 x s16>) + ; GFX9-NEXT: $vgpr3 = COPY [[FADD3]](<2 x s16>) + ; GFX9-NEXT: $vgpr4 = COPY [[FADD4]](<2 x s16>) + ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4 + ; GFX10-LABEL: name: test_v9f16 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, 
$vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr7 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr9 + ; GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY]], [[COPY5]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY1]], [[COPY6]] + ; GFX10-NEXT: [[FADD2:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY2]], [[COPY7]] + ; GFX10-NEXT: [[FADD3:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY3]], [[COPY8]] + ; GFX10-NEXT: [[FADD4:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY4]], [[COPY9]] + ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[FADD1]](<2 x s16>) + ; GFX10-NEXT: $vgpr2 = COPY [[FADD2]](<2 x s16>) + ; GFX10-NEXT: $vgpr3 = COPY [[FADD3]](<2 x s16>) + ; GFX10-NEXT: $vgpr4 = COPY [[FADD4]](<2 x s16>) + ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4 + %2:vgpr(<2 x s16>) = COPY $vgpr0 + %3:vgpr(<2 x s16>) = COPY $vgpr1 + %4:vgpr(<2 x s16>) = COPY $vgpr2 + %5:vgpr(<2 x s16>) = COPY $vgpr3 + %6:vgpr(<2 x s16>) = COPY $vgpr4 + %98:vgpr(s32) = G_BITCAST %2(<2 x s16>) + %99:sgpr(s32) = G_CONSTANT i32 16 + %157:vgpr(s32) = COPY %99(s32) + %100:vgpr(s32) = G_LSHR %98, %157(s32) + %101:vgpr(s32) = G_BITCAST %3(<2 x s16>) + %158:vgpr(s32) = COPY %99(s32) + %102:vgpr(s32) = G_LSHR %101, %158(s32) + %103:vgpr(s32) = G_BITCAST %4(<2 x s16>) + %159:vgpr(s32) = COPY %99(s32) + %104:vgpr(s32) = G_LSHR %103, %159(s32) + %105:vgpr(s32) = G_BITCAST %5(<2 x s16>) + %160:vgpr(s32) = COPY %99(s32) + %106:vgpr(s32) = G_LSHR %105, %160(s32) + %107:vgpr(s32) = G_BITCAST %6(<2 x s16>) + %18:vgpr(<2 x s16>) = COPY $vgpr5 + %19:vgpr(<2 x s16>) = COPY $vgpr6 + %20:vgpr(<2 x s16>) = COPY $vgpr7 + %21:vgpr(<2 x s16>) = COPY $vgpr8 + %22:vgpr(<2 x s16>) = COPY $vgpr9 + %109:vgpr(s32) = G_BITCAST %18(<2 x s16>) + %161:vgpr(s32) = COPY %99(s32) + %110:vgpr(s32) = G_LSHR %109, %161(s32) + %111:vgpr(s32) = G_BITCAST %19(<2 x s16>) + %162:vgpr(s32) = COPY %99(s32) + %112:vgpr(s32) = G_LSHR %111, %162(s32) + %113:vgpr(s32) = G_BITCAST %20(<2 x s16>) + %163:vgpr(s32) = COPY %99(s32) + %114:vgpr(s32) = G_LSHR %113, %163(s32) + %115:vgpr(s32) = G_BITCAST %21(<2 x s16>) + %164:vgpr(s32) = COPY %99(s32) + %116:vgpr(s32) = G_LSHR %115, %164(s32) + %117:vgpr(s32) = G_BITCAST %22(<2 x s16>) + %165:vgpr(s32) = G_CONSTANT i32 65535 + %166:vgpr(s32) = G_CONSTANT i32 16 + %167:vgpr(s32) = G_SHL %100, %166(s32) + %168:vgpr(s32) = G_AND %98, %165 + %169:vgpr(s32) = G_OR %168, %167 + %83:vgpr(<2 x s16>) = G_BITCAST %169(s32) + %170:vgpr(s32) = G_CONSTANT i32 65535 + %171:vgpr(s32) = G_CONSTANT i32 16 + %172:vgpr(s32) = G_SHL %102, %171(s32) + %173:vgpr(s32) = G_AND %101, %170 + %174:vgpr(s32) = G_OR %173, %172 + %84:vgpr(<2 x s16>) = G_BITCAST %174(s32) + %175:vgpr(s32) = G_CONSTANT i32 65535 + %176:vgpr(s32) = G_CONSTANT i32 16 + %177:vgpr(s32) = G_SHL %104, %176(s32) + %178:vgpr(s32) = G_AND %103, %175 + %179:vgpr(s32) = G_OR %178, 
%177 + %85:vgpr(<2 x s16>) = G_BITCAST %179(s32) + %180:vgpr(s32) = G_CONSTANT i32 65535 + %181:vgpr(s32) = G_CONSTANT i32 16 + %182:vgpr(s32) = G_SHL %106, %181(s32) + %183:vgpr(s32) = G_AND %105, %180 + %184:vgpr(s32) = G_OR %183, %182 + %86:vgpr(<2 x s16>) = G_BITCAST %184(s32) + %128:sgpr(s32) = G_IMPLICIT_DEF + %185:vgpr(s32) = G_CONSTANT i32 65535 + %186:sgpr(s32) = G_CONSTANT i32 16 + %187:sgpr(s32) = G_SHL %128, %186(s32) + %188:vgpr(s32) = G_AND %107, %185 + %189:vgpr(s32) = G_OR %188, %187 + %87:vgpr(<2 x s16>) = G_BITCAST %189(s32) + %190:vgpr(s32) = G_CONSTANT i32 65535 + %191:vgpr(s32) = G_CONSTANT i32 16 + %192:vgpr(s32) = G_SHL %110, %191(s32) + %193:vgpr(s32) = G_AND %109, %190 + %194:vgpr(s32) = G_OR %193, %192 + %88:vgpr(<2 x s16>) = G_BITCAST %194(s32) + %195:vgpr(s32) = G_CONSTANT i32 65535 + %196:vgpr(s32) = G_CONSTANT i32 16 + %197:vgpr(s32) = G_SHL %112, %196(s32) + %198:vgpr(s32) = G_AND %111, %195 + %199:vgpr(s32) = G_OR %198, %197 + %89:vgpr(<2 x s16>) = G_BITCAST %199(s32) + %200:vgpr(s32) = G_CONSTANT i32 65535 + %201:vgpr(s32) = G_CONSTANT i32 16 + %202:vgpr(s32) = G_SHL %114, %201(s32) + %203:vgpr(s32) = G_AND %113, %200 + %204:vgpr(s32) = G_OR %203, %202 + %90:vgpr(<2 x s16>) = G_BITCAST %204(s32) + %205:vgpr(s32) = G_CONSTANT i32 65535 + %206:vgpr(s32) = G_CONSTANT i32 16 + %207:vgpr(s32) = G_SHL %116, %206(s32) + %208:vgpr(s32) = G_AND %115, %205 + %209:vgpr(s32) = G_OR %208, %207 + %91:vgpr(<2 x s16>) = G_BITCAST %209(s32) + %210:vgpr(s32) = G_CONSTANT i32 65535 + %211:sgpr(s32) = G_CONSTANT i32 16 + %212:sgpr(s32) = G_SHL %128, %211(s32) + %213:vgpr(s32) = G_AND %117, %210 + %214:vgpr(s32) = G_OR %213, %212 + %92:vgpr(<2 x s16>) = G_BITCAST %214(s32) + %93:vgpr(<2 x s16>) = G_FADD %83, %88 + %94:vgpr(<2 x s16>) = G_FADD %84, %89 + %95:vgpr(<2 x s16>) = G_FADD %85, %90 + %96:vgpr(<2 x s16>) = G_FADD %86, %91 + %97:vgpr(<2 x s16>) = G_FADD %87, %92 + %129:vgpr(s32) = G_BITCAST %93(<2 x s16>) + %215:vgpr(s32) = COPY %99(s32) + %130:vgpr(s32) = G_LSHR %129, %215(s32) + %131:vgpr(s32) = G_BITCAST %94(<2 x s16>) + %216:vgpr(s32) = COPY %99(s32) + %132:vgpr(s32) = G_LSHR %131, %216(s32) + %133:vgpr(s32) = G_BITCAST %95(<2 x s16>) + %217:vgpr(s32) = COPY %99(s32) + %134:vgpr(s32) = G_LSHR %133, %217(s32) + %135:vgpr(s32) = G_BITCAST %96(<2 x s16>) + %218:vgpr(s32) = COPY %99(s32) + %136:vgpr(s32) = G_LSHR %135, %218(s32) + %137:vgpr(s32) = G_BITCAST %97(<2 x s16>) + %219:vgpr(s32) = G_CONSTANT i32 65535 + %220:vgpr(s32) = G_CONSTANT i32 16 + %221:vgpr(s32) = G_SHL %130, %220(s32) + %222:vgpr(s32) = G_AND %129, %219 + %223:vgpr(s32) = G_OR %222, %221 + %36:vgpr(<2 x s16>) = G_BITCAST %223(s32) + %224:vgpr(s32) = G_CONSTANT i32 65535 + %225:vgpr(s32) = G_CONSTANT i32 16 + %226:vgpr(s32) = G_SHL %132, %225(s32) + %227:vgpr(s32) = G_AND %131, %224 + %228:vgpr(s32) = G_OR %227, %226 + %37:vgpr(<2 x s16>) = G_BITCAST %228(s32) + %229:vgpr(s32) = G_CONSTANT i32 65535 + %230:vgpr(s32) = G_CONSTANT i32 16 + %231:vgpr(s32) = G_SHL %134, %230(s32) + %232:vgpr(s32) = G_AND %133, %229 + %233:vgpr(s32) = G_OR %232, %231 + %38:vgpr(<2 x s16>) = G_BITCAST %233(s32) + %234:vgpr(s32) = G_CONSTANT i32 65535 + %235:vgpr(s32) = G_CONSTANT i32 16 + %236:vgpr(s32) = G_SHL %136, %235(s32) + %237:vgpr(s32) = G_AND %135, %234 + %238:vgpr(s32) = G_OR %237, %236 + %39:vgpr(<2 x s16>) = G_BITCAST %238(s32) + %239:vgpr(s32) = G_CONSTANT i32 65535 + %240:sgpr(s32) = G_CONSTANT i32 16 + %241:sgpr(s32) = G_SHL %128, %240(s32) + %242:vgpr(s32) = G_AND %137, %239 + %243:vgpr(s32) = G_OR %242, %241 
+ %40:vgpr(<2 x s16>) = G_BITCAST %243(s32) + $vgpr0 = COPY %36(<2 x s16>) + $vgpr1 = COPY %37(<2 x s16>) + $vgpr2 = COPY %38(<2 x s16>) + $vgpr3 = COPY %39(<2 x s16>) + $vgpr4 = COPY %40(<2 x s16>) + SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4 + +... +--- +name: test_v11f16 +alignment: 1 +legalized: true +regBankSelected: true +selected: false +failedISel: false +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + + ; GFX9-LABEL: name: test_v11f16 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr11 + ; GFX9-NEXT: [[FADD:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY]], [[COPY6]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY1]], [[COPY7]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY2]], [[COPY8]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY3]], [[COPY9]] + ; GFX9-NEXT: [[FADD4:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY4]], [[COPY10]] + ; GFX9-NEXT: [[FADD5:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY5]], [[COPY11]] + ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[FADD1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[FADD2]](<2 x s16>) + ; GFX9-NEXT: $vgpr3 = COPY [[FADD3]](<2 x s16>) + ; GFX9-NEXT: $vgpr4 = COPY [[FADD4]](<2 x s16>) + ; GFX9-NEXT: $vgpr5 = COPY [[FADD5]](<2 x s16>) + ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; GFX10-LABEL: name: test_v11f16 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr7 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr11 + ; GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY]], [[COPY6]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY1]], [[COPY7]] + ; GFX10-NEXT: 
[[FADD2:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY2]], [[COPY8]] + ; GFX10-NEXT: [[FADD3:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY3]], [[COPY9]] + ; GFX10-NEXT: [[FADD4:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY4]], [[COPY10]] + ; GFX10-NEXT: [[FADD5:%[0-9]+]]:vgpr(<2 x s16>) = G_FADD [[COPY5]], [[COPY11]] + ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[FADD1]](<2 x s16>) + ; GFX10-NEXT: $vgpr2 = COPY [[FADD2]](<2 x s16>) + ; GFX10-NEXT: $vgpr3 = COPY [[FADD3]](<2 x s16>) + ; GFX10-NEXT: $vgpr4 = COPY [[FADD4]](<2 x s16>) + ; GFX10-NEXT: $vgpr5 = COPY [[FADD5]](<2 x s16>) + ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + %2:vgpr(<2 x s16>) = COPY $vgpr0 + %3:vgpr(<2 x s16>) = COPY $vgpr1 + %4:vgpr(<2 x s16>) = COPY $vgpr2 + %5:vgpr(<2 x s16>) = COPY $vgpr3 + %6:vgpr(<2 x s16>) = COPY $vgpr4 + %7:vgpr(<2 x s16>) = COPY $vgpr5 + %116:vgpr(s32) = G_BITCAST %2(<2 x s16>) + %117:sgpr(s32) = G_CONSTANT i32 16 + %187:vgpr(s32) = COPY %117(s32) + %118:vgpr(s32) = G_LSHR %116, %187(s32) + %119:vgpr(s32) = G_BITCAST %3(<2 x s16>) + %188:vgpr(s32) = COPY %117(s32) + %120:vgpr(s32) = G_LSHR %119, %188(s32) + %121:vgpr(s32) = G_BITCAST %4(<2 x s16>) + %189:vgpr(s32) = COPY %117(s32) + %122:vgpr(s32) = G_LSHR %121, %189(s32) + %123:vgpr(s32) = G_BITCAST %5(<2 x s16>) + %190:vgpr(s32) = COPY %117(s32) + %124:vgpr(s32) = G_LSHR %123, %190(s32) + %125:vgpr(s32) = G_BITCAST %6(<2 x s16>) + %191:vgpr(s32) = COPY %117(s32) + %126:vgpr(s32) = G_LSHR %125, %191(s32) + %127:vgpr(s32) = G_BITCAST %7(<2 x s16>) + %21:vgpr(<2 x s16>) = COPY $vgpr6 + %22:vgpr(<2 x s16>) = COPY $vgpr7 + %23:vgpr(<2 x s16>) = COPY $vgpr8 + %24:vgpr(<2 x s16>) = COPY $vgpr9 + %25:vgpr(<2 x s16>) = COPY $vgpr10 + %26:vgpr(<2 x s16>) = COPY $vgpr11 + %129:vgpr(s32) = G_BITCAST %21(<2 x s16>) + %192:vgpr(s32) = COPY %117(s32) + %130:vgpr(s32) = G_LSHR %129, %192(s32) + %131:vgpr(s32) = G_BITCAST %22(<2 x s16>) + %193:vgpr(s32) = COPY %117(s32) + %132:vgpr(s32) = G_LSHR %131, %193(s32) + %133:vgpr(s32) = G_BITCAST %23(<2 x s16>) + %194:vgpr(s32) = COPY %117(s32) + %134:vgpr(s32) = G_LSHR %133, %194(s32) + %135:vgpr(s32) = G_BITCAST %24(<2 x s16>) + %195:vgpr(s32) = COPY %117(s32) + %136:vgpr(s32) = G_LSHR %135, %195(s32) + %137:vgpr(s32) = G_BITCAST %25(<2 x s16>) + %196:vgpr(s32) = COPY %117(s32) + %138:vgpr(s32) = G_LSHR %137, %196(s32) + %139:vgpr(s32) = G_BITCAST %26(<2 x s16>) + %197:vgpr(s32) = G_CONSTANT i32 65535 + %198:vgpr(s32) = G_CONSTANT i32 16 + %199:vgpr(s32) = G_SHL %118, %198(s32) + %200:vgpr(s32) = G_AND %116, %197 + %201:vgpr(s32) = G_OR %200, %199 + %98:vgpr(<2 x s16>) = G_BITCAST %201(s32) + %202:vgpr(s32) = G_CONSTANT i32 65535 + %203:vgpr(s32) = G_CONSTANT i32 16 + %204:vgpr(s32) = G_SHL %120, %203(s32) + %205:vgpr(s32) = G_AND %119, %202 + %206:vgpr(s32) = G_OR %205, %204 + %99:vgpr(<2 x s16>) = G_BITCAST %206(s32) + %207:vgpr(s32) = G_CONSTANT i32 65535 + %208:vgpr(s32) = G_CONSTANT i32 16 + %209:vgpr(s32) = G_SHL %122, %208(s32) + %210:vgpr(s32) = G_AND %121, %207 + %211:vgpr(s32) = G_OR %210, %209 + %100:vgpr(<2 x s16>) = G_BITCAST %211(s32) + %212:vgpr(s32) = G_CONSTANT i32 65535 + %213:vgpr(s32) = G_CONSTANT i32 16 + %214:vgpr(s32) = G_SHL %124, %213(s32) + %215:vgpr(s32) = G_AND %123, %212 + %216:vgpr(s32) = G_OR %215, %214 + %101:vgpr(<2 x s16>) = G_BITCAST %216(s32) + %217:vgpr(s32) = G_CONSTANT i32 65535 + %218:vgpr(s32) = G_CONSTANT i32 16 + %219:vgpr(s32) = G_SHL %126, %218(s32) + 
%220:vgpr(s32) = G_AND %125, %217 + %221:vgpr(s32) = G_OR %220, %219 + %102:vgpr(<2 x s16>) = G_BITCAST %221(s32) + %152:sgpr(s32) = G_IMPLICIT_DEF + %222:vgpr(s32) = G_CONSTANT i32 65535 + %223:sgpr(s32) = G_CONSTANT i32 16 + %224:sgpr(s32) = G_SHL %152, %223(s32) + %225:vgpr(s32) = G_AND %127, %222 + %226:vgpr(s32) = G_OR %225, %224 + %103:vgpr(<2 x s16>) = G_BITCAST %226(s32) + %227:vgpr(s32) = G_CONSTANT i32 65535 + %228:vgpr(s32) = G_CONSTANT i32 16 + %229:vgpr(s32) = G_SHL %130, %228(s32) + %230:vgpr(s32) = G_AND %129, %227 + %231:vgpr(s32) = G_OR %230, %229 + %104:vgpr(<2 x s16>) = G_BITCAST %231(s32) + %232:vgpr(s32) = G_CONSTANT i32 65535 + %233:vgpr(s32) = G_CONSTANT i32 16 + %234:vgpr(s32) = G_SHL %132, %233(s32) + %235:vgpr(s32) = G_AND %131, %232 + %236:vgpr(s32) = G_OR %235, %234 + %105:vgpr(<2 x s16>) = G_BITCAST %236(s32) + %237:vgpr(s32) = G_CONSTANT i32 65535 + %238:vgpr(s32) = G_CONSTANT i32 16 + %239:vgpr(s32) = G_SHL %134, %238(s32) + %240:vgpr(s32) = G_AND %133, %237 + %241:vgpr(s32) = G_OR %240, %239 + %106:vgpr(<2 x s16>) = G_BITCAST %241(s32) + %242:vgpr(s32) = G_CONSTANT i32 65535 + %243:vgpr(s32) = G_CONSTANT i32 16 + %244:vgpr(s32) = G_SHL %136, %243(s32) + %245:vgpr(s32) = G_AND %135, %242 + %246:vgpr(s32) = G_OR %245, %244 + %107:vgpr(<2 x s16>) = G_BITCAST %246(s32) + %247:vgpr(s32) = G_CONSTANT i32 65535 + %248:vgpr(s32) = G_CONSTANT i32 16 + %249:vgpr(s32) = G_SHL %138, %248(s32) + %250:vgpr(s32) = G_AND %137, %247 + %251:vgpr(s32) = G_OR %250, %249 + %108:vgpr(<2 x s16>) = G_BITCAST %251(s32) + %252:vgpr(s32) = G_CONSTANT i32 65535 + %253:sgpr(s32) = G_CONSTANT i32 16 + %254:sgpr(s32) = G_SHL %152, %253(s32) + %255:vgpr(s32) = G_AND %139, %252 + %256:vgpr(s32) = G_OR %255, %254 + %109:vgpr(<2 x s16>) = G_BITCAST %256(s32) + %110:vgpr(<2 x s16>) = G_FADD %98, %104 + %111:vgpr(<2 x s16>) = G_FADD %99, %105 + %112:vgpr(<2 x s16>) = G_FADD %100, %106 + %113:vgpr(<2 x s16>) = G_FADD %101, %107 + %114:vgpr(<2 x s16>) = G_FADD %102, %108 + %115:vgpr(<2 x s16>) = G_FADD %103, %109 + %153:vgpr(s32) = G_BITCAST %110(<2 x s16>) + %257:vgpr(s32) = COPY %117(s32) + %154:vgpr(s32) = G_LSHR %153, %257(s32) + %155:vgpr(s32) = G_BITCAST %111(<2 x s16>) + %258:vgpr(s32) = COPY %117(s32) + %156:vgpr(s32) = G_LSHR %155, %258(s32) + %157:vgpr(s32) = G_BITCAST %112(<2 x s16>) + %259:vgpr(s32) = COPY %117(s32) + %158:vgpr(s32) = G_LSHR %157, %259(s32) + %159:vgpr(s32) = G_BITCAST %113(<2 x s16>) + %260:vgpr(s32) = COPY %117(s32) + %160:vgpr(s32) = G_LSHR %159, %260(s32) + %161:vgpr(s32) = G_BITCAST %114(<2 x s16>) + %261:vgpr(s32) = COPY %117(s32) + %162:vgpr(s32) = G_LSHR %161, %261(s32) + %163:vgpr(s32) = G_BITCAST %115(<2 x s16>) + %262:vgpr(s32) = G_CONSTANT i32 65535 + %263:vgpr(s32) = G_CONSTANT i32 16 + %264:vgpr(s32) = G_SHL %154, %263(s32) + %265:vgpr(s32) = G_AND %153, %262 + %266:vgpr(s32) = G_OR %265, %264 + %42:vgpr(<2 x s16>) = G_BITCAST %266(s32) + %267:vgpr(s32) = G_CONSTANT i32 65535 + %268:vgpr(s32) = G_CONSTANT i32 16 + %269:vgpr(s32) = G_SHL %156, %268(s32) + %270:vgpr(s32) = G_AND %155, %267 + %271:vgpr(s32) = G_OR %270, %269 + %43:vgpr(<2 x s16>) = G_BITCAST %271(s32) + %272:vgpr(s32) = G_CONSTANT i32 65535 + %273:vgpr(s32) = G_CONSTANT i32 16 + %274:vgpr(s32) = G_SHL %158, %273(s32) + %275:vgpr(s32) = G_AND %157, %272 + %276:vgpr(s32) = G_OR %275, %274 + %44:vgpr(<2 x s16>) = G_BITCAST %276(s32) + %277:vgpr(s32) = G_CONSTANT i32 65535 + %278:vgpr(s32) = G_CONSTANT i32 16 + %279:vgpr(s32) = G_SHL %160, %278(s32) + %280:vgpr(s32) = G_AND %159, %277 + 
%281:vgpr(s32) = G_OR %280, %279 + %45:vgpr(<2 x s16>) = G_BITCAST %281(s32) + %282:vgpr(s32) = G_CONSTANT i32 65535 + %283:vgpr(s32) = G_CONSTANT i32 16 + %284:vgpr(s32) = G_SHL %162, %283(s32) + %285:vgpr(s32) = G_AND %161, %282 + %286:vgpr(s32) = G_OR %285, %284 + %46:vgpr(<2 x s16>) = G_BITCAST %286(s32) + %287:vgpr(s32) = G_CONSTANT i32 65535 + %288:sgpr(s32) = G_CONSTANT i32 16 + %289:sgpr(s32) = G_SHL %152, %288(s32) + %290:vgpr(s32) = G_AND %163, %287 + %291:vgpr(s32) = G_OR %290, %289 + %47:vgpr(<2 x s16>) = G_BITCAST %291(s32) + $vgpr0 = COPY %42(<2 x s16>) + $vgpr1 = COPY %43(<2 x s16>) + $vgpr2 = COPY %44(<2 x s16>) + $vgpr3 = COPY %45(<2 x s16>) + $vgpr4 = COPY %46(<2 x s16>) + $vgpr5 = COPY %47(<2 x s16>) + SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -545,13 +545,7 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v3f16_xyz: @@ -564,15 +558,8 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v