Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -88,6 +88,12 @@ [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; +def remove_or_and_shl : GICombineRule< + (defs root:$rm_bitcast, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_BITCAST):$rm_bitcast, + [{ return RegBankHelper.matchCombineOrAndShl(*${rm_bitcast}, ${matchinfo}); }]), + (apply [{ Helper.replaceSingleDefInstWithReg(*${rm_bitcast}, ${matchinfo}); }])>; + def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< @@ -128,7 +134,8 @@ def AMDGPURegBankCombinerHelper : GICombinerHelper< "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, - fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, + remove_or_and_shl]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; Index: llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -73,6 +73,7 @@ bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); void applyClamp(MachineInstr &MI, Register &Reg); + bool matchCombineOrAndShl(MachineInstr &MI, Register &NewReg); private: AMDGPU::SIModeRegisterDefaults getMode(); @@ -325,6 +326,50 @@ MI.eraseFromParent(); } +/// Combiner that removes the unnecessary expression +/// G_BITCAST (G_OR ((G_AND X, 65535), (G_SHL Y, 16))) +/// and uses X or Y instead, depending on what instructions +/// they are. +/// In case X and Y are independant, the combiner will do nothing. +/// If X or Y is G_IMPLICIT_DEF, the result will be the other non-implicit +/// register. The result of the previous expression is similar, but +/// instead of having the same value as X or Y, it only holds the higher +/// or lower 16 bits of the non-implicit register. +bool AMDGPURegBankCombinerHelper::matchCombineOrAndShl(MachineInstr &MI, + Register &NewReg) { + Register SrcReg = MI.getOperand(0).getReg(); + MachineInstr *ShlSrcMI, *AndSrcMI; + + if (mi_match( + SrcReg, MRI, + m_GBitcast(m_GOr(m_GAnd(m_MInstr(AndSrcMI), m_SpecificICst(0xffff)), + m_GShl(m_MInstr(ShlSrcMI), m_SpecificICst(16)))))) { + if (ShlSrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF && + AndSrcMI->getOpcode() == TargetOpcode::G_BITCAST) { + NewReg = AndSrcMI->getOperand(1).getReg(); + return true; + } else if (AndSrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF && + ShlSrcMI->getOpcode() == TargetOpcode::G_BITCAST) { + NewReg = ShlSrcMI->getOperand(1).getReg(); + return true; + } else if (ShlSrcMI->getOpcode() == TargetOpcode::G_LSHR && + ShlSrcMI->getOperand(1).getReg() == + AndSrcMI->getOperand(0).getReg()) { + MachineInstr *ShlCstArg = + getDefIgnoringCopies(ShlSrcMI->getOperand(2).getReg(), MRI); + + if (ShlCstArg->getOpcode() == TargetOpcode::G_CONSTANT && + ShlCstArg->getOperand(1).isCImm() && + ShlCstArg->getOperand(1).getCImm()->equalsInt(16)) { + NewReg = AndSrcMI->getOperand(1).getReg(); + return true; + } + } + } + + return false; +} + AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { return MF.getInfo()->getMode(); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -786,229 +786,69 @@ ; GFX9-LABEL: test_3xhalf_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v3, v4, v9, v3 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_pk_add_f16 v1, v4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v6 -; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v5, v5, v9, s4 -; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-DENORM-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v4, v9, v3 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v4, v1 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v6 -; GFX9-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v5, v5, v9, s4 -; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_3xhalf_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-NEXT: v_and_or_b32 v2, v2, v8, v7 -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-NEXT: v_and_or_b32 v2, v5, v8, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: v_pk_add_f16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v8 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v8, v7 -; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v8, s4 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v8 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-and-shl.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-and-shl.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s + +define amdgpu_vs <3 x half> @test_v3f16(<3 x half> %x, <3 x half> %y) { +; GFX9-LABEL: test_v3f16: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_v3f16: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v3 +; GFX10-NEXT: ; return to shader part epilog +.entry: + %a = fadd <3 x half> %x, %y + ret <3 x half> %a +} + +define amdgpu_vs <9 x half> @test_v9f16(<9 x half> %x, <9 x half> %y) { +; GFX9-LABEL: test_v9f16: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_pk_add_f16 v0, v0, v5 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v6 +; GFX9-NEXT: v_pk_add_f16 v2, v2, v7 +; GFX9-NEXT: v_pk_add_f16 v3, v3, v8 +; GFX9-NEXT: v_pk_add_f16 v4, v4, v9 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_v9f16: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_pk_add_f16 v0, v0, v5 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v6 +; GFX10-NEXT: v_pk_add_f16 v2, v2, v7 +; GFX10-NEXT: v_pk_add_f16 v3, v3, v8 +; GFX10-NEXT: v_pk_add_f16 v4, v4, v9 +; GFX10-NEXT: ; return to shader part epilog +.entry: + %a = fadd <9 x half> %x, %y + ret <9 x half> %a +} + +define amdgpu_vs <11 x half> @test_v11f16(<11 x half> %x, <11 x half> %y) { +; GFX9-LABEL: test_v11f16: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_pk_add_f16 v0, v0, v6 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v7 +; GFX9-NEXT: v_pk_add_f16 v2, v2, v8 +; GFX9-NEXT: v_pk_add_f16 v3, v3, v9 +; GFX9-NEXT: v_pk_add_f16 v4, v4, v10 +; GFX9-NEXT: v_pk_add_f16 v5, v5, v11 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_v11f16: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: v_pk_add_f16 v0, v0, v6 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v7 +; GFX10-NEXT: v_pk_add_f16 v2, v2, v8 +; GFX10-NEXT: v_pk_add_f16 v3, v3, v9 +; GFX10-NEXT: v_pk_add_f16 v4, v4, v10 +; GFX10-NEXT: v_pk_add_f16 v5, v5, v11 +; GFX10-NEXT: ; return to shader part epilog +.entry: + %a = fadd <11 x half> %x, %y + ret <11 x half> %a +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -545,13 +545,7 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v3f16_xyz: @@ -564,15 +558,8 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v