Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -692,10 +692,10 @@
 def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
   AssemblerPredicate<"FeatureGFX9Insts">;
 
-def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">,
+def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
   AssemblerPredicate<"FeatureAddNoCarryInsts">;
 
-def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">,
+def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">,
   AssemblerPredicate<"!FeatureAddNoCarryInsts">;
 
 def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1639,6 +1639,11 @@
   bit ret = !if(a, !if(b, 1, 0), 0);
 }
 
+def PatGenMode {
+  int NoPattern = 0;
+  int Pattern = 1;
+}
+
 class VOPProfile <list<ValueType> _ArgVT> {
 
   field list<ValueType> ArgVT = _ArgVT;
@@ -1706,6 +1711,7 @@
 
   field bit HasExt = getHasExt.ret;
   field bit HasSDWA9 = HasExt;
+  field int NeedPatGen = PatGenMode.NoPattern;
 
   field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
   field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -1758,6 +1764,10 @@
   let HasSDWA9 = 0;
 }
 
+class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> {
+  let NeedPatGen = mode;
+}
+
 def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
 def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
 def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -15,8 +15,8 @@
   let SubtargetPredicate = isGCN;
 }
 
-include "VOPInstructions.td"
 include "SOPInstructions.td"
+include "VOPInstructions.td"
 include "SMInstructions.td"
 include "FLATInstructions.td"
 include "BUFInstructions.td"
@@ -727,12 +727,14 @@
 defm : SelectPat ;
 defm : SelectPat ;
 
+let AddedComplexity = 1 in {
 def : GCNPat <
-  (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+  (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)),
   (V_BCNT_U32_B32_e64 $popcnt, $val)
 >;
+}
 
 def : GCNPat <
-  (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)),
+  (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)),
   (V_BCNT_U32_B32_e64 $popcnt, $val)
 >;
Index: lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/SOPInstructions.td
+++ lib/Target/AMDGPU/SOPInstructions.td
@@ -336,42 +336,48 @@
   "$sdst, $src0, $src1", pattern
 >;
 
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1),
+  (Op $src0, $src1),
+  [{ return !N->isDivergent(); }]
+>;
+
 let Defs = [SCC] in { // Carry out goes to SCC
 let isCommutable = 1 in {
 def S_ADD_U32 : SOP2_32 <"s_add_u32">;
 def S_ADD_I32 : SOP2_32 <"s_add_i32",
-  [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))]
 >;
 } // End isCommutable = 1
 
 def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
 def S_SUB_I32 : SOP2_32 <"s_sub_i32",
-  [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<sub> SSrc_b32:$src0, SSrc_b32:$src1))]
 >;
 
 let Uses = [SCC] in { // Carry in comes from SCC
 let isCommutable = 1 in {
 def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
-  [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+  [(set i32:$sdst, (UniformBinFrag<adde> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
 } // End isCommutable = 1
 
 def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
-  [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+  [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
 } // End Uses = [SCC]
 
 let isCommutable = 1 in {
 def S_MIN_I32 : SOP2_32 <"s_min_i32",
-  [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
 >;
 def S_MIN_U32 : SOP2_32 <"s_min_u32",
-  [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
 >;
 def S_MAX_I32 : SOP2_32 <"s_max_i32",
-  [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
 >;
 def S_MAX_U32 : SOP2_32 <"s_max_u32",
-  [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
 >;
 } // End isCommutable = 1
 } // End Defs = [SCC]
@@ -385,27 +391,27 @@
 let Defs = [SCC] in {
 let isCommutable = 1 in {
 def S_AND_B32 : SOP2_32 <"s_and_b32",
-  [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, i32:$src1))]
 >;
 
 def S_AND_B64 : SOP2_64 <"s_and_b64",
-  [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, i64:$src1))]
 >;
 
 def S_OR_B32 : SOP2_32 <"s_or_b32",
-  [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, i32:$src1))]
 >;
 
 def S_OR_B64 : SOP2_64 <"s_or_b64",
-  [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, i64:$src1))]
 >;
 
 def S_XOR_B32 : SOP2_32 <"s_xor_b32",
-  [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<xor> i32:$src0, i32:$src1))]
 >;
 
 def S_XOR_B64 : SOP2_64 <"s_xor_b64",
-  [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+  [(set i64:$sdst, (UniformBinFrag<xor> i64:$src0, i64:$src1))]
 >;
 
 def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
@@ -431,20 +437,21 @@
 let AddedComplexity = 1 in {
 
 let Defs = [SCC] in {
+// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
-  [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))]
 >;
 def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
   [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
 >;
 def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
-  [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))]
 >;
 def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
   [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
 >;
 def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
-  [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+  [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))]
 >;
 def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
   [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
@@ -452,8 +459,10 @@
 } // End Defs = [SCC]
 
 def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
-  [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+  [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
 def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
+
+// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change
 def S_MUL_I32 : SOP2_32 <"s_mul_i32",
   [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
   let isCommutable = 1;
Index: lib/Target/AMDGPU/VOP2Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP2Instructions.td
+++ lib/Target/AMDGPU/VOP2Instructions.td
@@ -124,7 +124,7 @@
   let renamedInGFX9 = GFX9Renamed in {
 
-    def _e32 : VOP2_Pseudo <opName, P>,
+    def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
               Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -144,7 +144,7 @@
   let renamedInGFX9 = GFX9Renamed in {
     let SchedRW = [Write32Bit, WriteSALU] in {
       let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
-        def _e32 : VOP2_Pseudo <opName, P>,
+        def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>,
                    Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
 
       def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
@@ -352,10 +352,10 @@
 // VOP2 Instructions
 //===----------------------------------------------------------------------===//
 
-let SubtargetPredicate = isGCN in {
+let SubtargetPredicate = isGCN, Predicates = [isGCN] in {
 
 defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
 
 let isCommutable = 1 in {
 defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -363,29 +363,29 @@
 defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
 defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
 defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
-defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
-defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
-defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
-defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32>, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32>, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32>, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32>, AMDGPUmulhi_u24>;
 defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
 defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
-defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
-defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
-defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
-defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
 defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
 defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
 defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
-defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
-defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
-defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
 
 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1 in {
 defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
 }
 
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
 
 // No patterns so that the scalar instructions are always selected.
 // The scalar versions will be replaced with vector when needed later.
@@ -411,11 +411,11 @@
 // These are special and do not read the exec mask.
 let isConvergent = 1, Uses = [] in {
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
-  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
 
 let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
 def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
-  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
 } // End $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
@@ -431,7 +431,7 @@
 defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
 defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
 
-} // End SubtargetPredicate = isGCN
+} // End SubtargetPredicate = isGCN, Predicates = [isGCN]
 
 def : GCNPat<
     (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
@@ -444,19 +444,73 @@
 >;
 
 // These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
+let SubtargetPredicate = isSICI, Predicates = [isSICI] in {
 
 defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
 defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
 
 let isCommutable = 1 in {
 defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
 } // End isCommutable = 1
 
-} // End let SubtargetPredicate = SICI
+} // End let SubtargetPredicate = SICI, Predicates = [isSICI]
+
+class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+  GCNPat<
+      (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+      !if(!cast<Commutable_REV>(Inst).IsOrig,
+        (Inst $src0, $src1),
+        (Inst $src1, $src0)
+      )
+  >;
+
+let AddedComplexity = 1 in {
+  def : DivergentBinOp;
+  def : DivergentBinOp;
+  def : DivergentBinOp;
+}
+
+let SubtargetPredicate = HasAddNoCarryInsts in {
+  def : DivergentBinOp;
+  def : DivergentBinOp;
+  def : DivergentBinOp;
+}
+
+
+def : DivergentBinOp;
+
+def : DivergentBinOp;
+def : DivergentBinOp;
+
+def : DivergentBinOp;
+
+def : DivergentBinOp;
+def : DivergentBinOp;
+def : DivergentBinOp;
+def : DivergentBinOp;
+def : DivergentBinOp;
+
+class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
+  GCNPat<
+      (getDivergentFrag<Op>.ret i64:$src0, i64:$src1),
+      (REG_SEQUENCE VReg_64,
+        (Inst
+          (i32 (EXTRACT_SUBREG $src0, sub0)),
+          (i32 (EXTRACT_SUBREG $src1, sub0))
+        ), sub0,
+        (Inst
+          (i32 (EXTRACT_SUBREG $src0, sub1)),
+          (i32 (EXTRACT_SUBREG $src1, sub1))
+        ), sub1
+      )
+  >;
+
+def : divergent_i64_BinOp ;
+def : divergent_i64_BinOp ;
+def : divergent_i64_BinOp ;
 
 let SubtargetPredicate = Has16BitInsts in {
Index: lib/Target/AMDGPU/VOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOPInstructions.td
+++ lib/Target/AMDGPU/VOPInstructions.td
@@ -528,6 +528,50 @@
   let DecoderNamespace = "DPP";
 }
 
+class getNumNodeArgs<SDPatternOperator Op> {
+  SDNode N = !cast<SDNode>(Op);
+  SDTypeProfile TP = N.TypeProfile;
+  int ret = TP.NumOperands;
+}
+
+
+class getDivergentFrag<SDPatternOperator Op> {
+
+  int NumSrcArgs = getNumNodeArgs<Op>.ret;
+  PatFrag ret = PatFrag <
+    !if(!eq(NumSrcArgs, 1),
+             (ops node:$src0),
+             !if(!eq(NumSrcArgs, 2),
+               (ops node:$src0, node:$src1),
+               (ops node:$src0, node:$src1, node:$src2))),
+    !if(!eq(NumSrcArgs, 1),
+             (Op $src0),
+             !if(!eq(NumSrcArgs, 2),
+               (Op $src0, $src1),
+               (Op $src0, $src1, $src2))),
+    [{ return N->isDivergent(); }]
+  >;
+}
+
+class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
+
+  PatFrag Operator = getDivergentFrag < Op >.ret;
+
+  dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator,
+                                          !subst(P.Src0RC32, P.Src0VT,
+                                                 !subst(P.Src1RC32, P.Src1VT, tmp))));
+
+
+  dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set,
+                                            !subst(P.DstRC, P.DstVT, tmp)));
+
+  list<dag> ret = [!con(Outs, (set Ins))];
+}
+
+class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> {
+  list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op,P>.ret, []);
+}
+
 include "VOPCInstructions.td"
 include "VOP1Instructions.td"
 include "VOP2Instructions.td"
Index: test/CodeGen/AMDGPU/add.ll
===================================================================
--- test/CodeGen/AMDGPU/add.ll
+++ test/CodeGen/AMDGPU/add.ll
@@ -121,7 +121,7 @@
 ; FUNC-LABEL: {{^}}v_add_i32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[B]], [[A]]
+; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
Index: test/CodeGen/AMDGPU/amdgcn.private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -13,7 +13,7 @@
 ; GCN-LABEL: {{^}}work_item_info:
 ; GCN-NOT: v0
-; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
+; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
Index: test/CodeGen/AMDGPU/bfe-patterns.ll
===================================================================
--- test/CodeGen/AMDGPU/bfe-patterns.ll
+++ test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -24,11 +24,8 @@
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
@@ -100,11 +97,8 @@
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
 ; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
-; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
-; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
-
-; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
-; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
 ; GCN: [[BFE]]
 ; GCN: [[SHL]]
Index: test/CodeGen/AMDGPU/ctpop64.ll
===================================================================
--- test/CodeGen/AMDGPU/ctpop64.ll
+++ test/CodeGen/AMDGPU/ctpop64.ll
@@ -188,7 +188,7 @@
 ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
 ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]]
-; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]]
+; GCN: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/extract-lowbits.ll
===================================================================
--- test/CodeGen/AMDGPU/extract-lowbits.ll
+++ test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -169,8 +169,8 @@
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_d1_indexzext:
Index: test/CodeGen/AMDGPU/fabs.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fabs.f16.ll
+++ test/CodeGen/AMDGPU/fabs.f16.ll
@@ -102,11 +102,10 @@
 ; GCN-LABEL: {{^}}v_fabs_fold_self_v2f16:
 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshrrev_b32_e32 [[VREG:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_cvt_f32_f16_e32 [[NORM:v[0-9]+]], [[VREG]]
+; CI: v_cvt_f32_f16_e64 [[ABS:v[0-9]+]], {{\|}}[[VREG]]{{\|}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, [[ABS]], [[NORM]]
 ; CI: v_cvt_f16_f32
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_cvt_f16_f32
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -286,7 +286,7 @@
 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
@@ -306,7 +306,7 @@
 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
@@ -373,7 +373,7 @@
 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
@@ -393,7 +393,7 @@
 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
@@ -534,7 +534,7 @@
 ; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
 ; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
-; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], [[COPY_VAL]], v[[LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
@@ -588,7 +588,7 @@
 ; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
 ; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
-; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], [[COPY_VAL]], v[[HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -15,9 +15,9 @@
 }
 
 ; VI-LABEL: {{^}}dpp_test1:
-; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; VI-NOOPT: v_mov_b32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}
+; VI-NOOPT0: v_mov_b32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: v_mov_b32_dpp v2, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
Index: test/CodeGen/AMDGPU/shift-i128.ll
===================================================================
--- test/CodeGen/AMDGPU/shift-i128.ll
+++ test/CodeGen/AMDGPU/shift-i128.ll
@@ -4,26 +4,24 @@
 define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: v_shl_i128_vv:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_b64 v[5:6], v[0:1], v4
-; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v4
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4
-; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; GCN-NEXT: v_lshr_b64 v[9:10], v[0:1], v9
-; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
-; GCN-NEXT: v_or_b32_e32 v7, v7, v9
-; GCN-NEXT: v_or_b32_e32 v8, v8, v10
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: v_mov_b32_e32 v0, v5
-; GCN-NEXT: v_mov_b32_e32 v1, v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4
+; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4
+; GCN-NEXT: v_lshl_b64 v[7:8], v[0:1], v4
+; GCN-NEXT: v_lshr_b64 v[9:10], v[0:1], v9
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11
+; GCN-NEXT: v_or_b32_e32 v6, v6, v10
+; GCN-NEXT: v_or_b32_e32 v5, v5, v9
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i128 %lhs, %rhs
   ret i128 %shl
 }
@@ -32,25 +30,24 @@
 ; GCN-LABEL: v_lshr_i128_vv:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshr_b64 v[5:6], v[2:3], v4
-; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v4
-; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4
-; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
-; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
-; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v9
-; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v11
-; GCN-NEXT: v_or_b32_e32 v7, v7, v9
-; GCN-NEXT: v_or_b32_e32 v8, v8, v10
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v2, v5
-; GCN-NEXT: v_mov_b32_e32 v3, v6
-; GCN-NEXT: s_setpc_b64 s[30:31]
+
+; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4
+; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4
+; GCN-NEXT: v_lshr_b64 v[7:8], v[2:3], v4
+; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v9
+; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v11
+; GCN-NEXT: v_or_b32_e32 v6, v6, v10
+; GCN-NEXT: v_or_b32_e32 v5, v5, v9
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GCN-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc
+; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = lshr i128 %lhs, %rhs
   ret i128 %shl
 }
@@ -69,8 +66,8 @@
 ; GCN-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
 ; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v10
 ; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v11
-; GCN-NEXT: v_or_b32_e32 v7, v7, v9
 ; GCN-NEXT: v_or_b32_e32 v8, v8, v10
+; GCN-NEXT: v_or_b32_e32 v7, v7, v9
 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
@@ -497,19 +494,19 @@
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8
 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
-; GCN-NEXT: v_or_b32_e32 v20, v16, v18
-; GCN-NEXT: v_or_b32_e32 v21, v17, v19
+; GCN-NEXT: v_or_b32_e32 v20, v17, v19
+; GCN-NEXT: v_or_b32_e32 v21, v16, v18
 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v16
 ; GCN-NEXT: v_lshl_b64 v[18:19], v[6:7], v12
-; GCN-NEXT: v_or_b32_e32 v16, v18, v16
 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17
+; GCN-NEXT: v_or_b32_e32 v16, v18, v16
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15
+; GCN-NEXT: v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
 ; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v8
 ; GCN-NEXT: v_lshl_b64 v[8:9], v[0:1], v8
@@ -519,8 +516,8 @@
 ; GCN-NEXT: v_lshl_b64 v[12:13], v[4:5], v12
 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v18
 ; GCN-NEXT: s_and_b64 vcc, s[6:7], s[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v18, v1, v21, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v19, v0, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v18, v1, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v19, v0, v21, vcc
 ; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[12:13]
 ; GCN-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[6:7]
 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[6:7]
@@ -546,19 +543,19 @@
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
-; GCN-NEXT: v_or_b32_e32 v20, v16, v18
-; GCN-NEXT: v_or_b32_e32 v21, v17, v19
+; GCN-NEXT: v_or_b32_e32 v20, v17, v19
+; GCN-NEXT: v_or_b32_e32 v21, v16, v18
 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v16
 ; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v12
-; GCN-NEXT: v_or_b32_e32 v16, v18, v16
 ; GCN-NEXT: v_or_b32_e32 v17, v19, v17
+; GCN-NEXT: v_or_b32_e32 v16, v18, v16
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15
+; GCN-NEXT: v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
 ; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v8
 ; GCN-NEXT: v_lshr_b64 v[8:9], v[2:3], v8
@@ -568,8 +565,8 @@
 ; GCN-NEXT: v_lshr_b64 v[12:13], v[6:7], v12
 ; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v18
 ; GCN-NEXT: s_and_b64 vcc, s[6:7], s[10:11]
-; GCN-NEXT: v_cndmask_b32_e32 v18, v3, v21, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v19, v2, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v18, v3, v20, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v19, v2, v21, vcc
 ; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[12:13]
 ; GCN-NEXT: v_cndmask_b32_e64 v17, v7, v17, s[6:7]
 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[6:7]
@@ -595,25 +592,25 @@
 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
-; GCN-NEXT: v_or_b32_e32 v20, v16, v18
-; GCN-NEXT: v_or_b32_e32 v21, v17, v19
+; GCN-NEXT: v_or_b32_e32 v20, v17, v19
+; GCN-NEXT: v_or_b32_e32 v21, v16, v18
 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12
 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v16
 ; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v12
-; GCN-NEXT: v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17
+; GCN-NEXT: v_or_b32_e32 v18, v18, v16
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11
+; GCN-NEXT: v_or_b32_e32 v10, v8, v10
 ; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
-; GCN-NEXT: v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15
+; GCN-NEXT: v_or_b32_e32 v14, v12, v14
 ; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
 ; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
 ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v21, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7]
 ; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[12:13]
 ; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8
 ; GCN-NEXT: v_ashrrev_i32_e32 v20, 31, v3
Index: test/CodeGen/AMDGPU/shl.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/shl.v2i16.ll
+++ test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -50,7 +50,7 @@
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
Index: test/CodeGen/AMDGPU/sibling-call.ll
===================================================================
--- test/CodeGen/AMDGPU/sibling-call.ll
+++ test/CodeGen/AMDGPU/sibling-call.ll
@@ -6,7 +6,7 @@
 ; FIXME: Why is this commuted only sometimes?
 ; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
 ; GCN-NEXT: s_setpc_b64
 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
@@ -16,7 +16,7 @@
 ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
 ; GCN: s_mov_b32 s5, s32
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24
@@ -88,7 +88,7 @@
 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -130,9 +130,9 @@
 ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
 ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v1, v0
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_0]], v0
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, [[LOAD_1]], v0
+; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
+; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
+; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
Index: test/CodeGen/AMDGPU/sub.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.ll
+++ test/CodeGen/AMDGPU/sub.ll
@@ -93,7 +93,7 @@
 }
 
 ; FUNC-LABEL: {{^}}test_sub_i16:
-; SI: v_subrev_i32_e32 v{{[0-9]+}}, vcc,
+; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
 ; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x()