diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -314,16 +314,16 @@ MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, - {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, - MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, - MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, - MVT::v6f32, MVT::v6i32, MVT::v7f32, MVT::v7i32, - MVT::v8f32, MVT::v8i32, MVT::v16f32, MVT::v16i32, - MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, - MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, - MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, - Custom); + setOperationAction( + ISD::EXTRACT_SUBVECTOR, + {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32, + MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32, + MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32, + MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16, + MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, + MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, + MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, + Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); @@ -1281,6 +1281,11 @@ (Start == 0 || Start == 4)) return Op; + if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) || + (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) && + (Start == 0 || Start == 8)) + return Op; + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -138,6 +138,8 @@ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); 
addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -235,12 +237,12 @@ // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. - for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, - MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, - MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32 }) { + for (MVT VT : + {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, + MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64, + MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, + MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -350,7 +352,7 @@ // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v4i16, MVT::v4f16}, + MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16}, Custom); // Deal with vec3 vector operations when widened to vec4. 
@@ -507,7 +509,7 @@ setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16}) { + MVT::v8f16, MVT::v16i16, MVT::v16f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -581,6 +583,16 @@ setOperationAction(ISD::STORE, MVT::v8f16, Promote); AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v16i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v16f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32); + setOperationAction(ISD::STORE, MVT::v16f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); @@ -603,12 +615,12 @@ setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, - {MVT::v4f16, MVT::v8f16}, Custom); + {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); - setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::v4f16, MVT::v8f16}, - Expand); + setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, + {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); - for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) { + for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { setOperationAction( {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, Vec16, Custom); @@ -630,10 +642,11 @@ Custom); setOperationAction(ISD::VECTOR_SHUFFLE, - {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16}, + {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, + MVT::v16f16, MVT::v16i16}, Custom); - for (MVT VT : {MVT::v4i16, MVT::v8i16}) + for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) // 
Split vector operations. setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, @@ -641,7 +654,7 @@ ISD::SSUBSAT}, VT, Custom); - for (MVT VT : {MVT::v4f16, MVT::v8f16}) + for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) // Split vector operations. setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, VT, Custom); @@ -677,7 +690,7 @@ setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v8i16, MVT::v8f16}, + MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, Custom); setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); @@ -4547,8 +4560,9 @@ unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4570,8 +4584,9 @@ unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || - VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo0, Hi0; SDValue Op0 = Op.getOperand(0); @@ -5270,7 +5285,7 @@ if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16 || VT == MVT::v8f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5720,17 +5735,35 @@ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (VecSize == 128) { + if 
(VecSize == 128 || VecSize == 256) { SDValue Lo, Hi; EVT LoVT, HiVT; - SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); - Lo = - DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, - V2, DAG.getConstant(0, SL, MVT::i32))); - Hi = - DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, - V2, DAG.getConstant(1, SL, MVT::i32))); + + if (VecSize == 128) { + SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); + Lo = DAG.getBitcast(LoVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(0, SL, MVT::i32))); + Hi = DAG.getBitcast(HiVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(1, SL, MVT::i32))); + } else { + assert(VecSize == 256); + + SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); + SDValue Parts[4]; + for (unsigned P = 0; P < 4; ++P) { + Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(P, SL, MVT::i32)); + } + + Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, + Parts[0], Parts[1])); + Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, + Parts[2], Parts[3])); + } + EVT IdxVT = Idx.getValueType(); unsigned NElem = VecVT.getVectorNumElements(); assert(isPowerOf2_32(NElem)); @@ -5873,6 +5906,27 @@ return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } + if (VT == MVT::v16i16 || VT == MVT::v16f16) { + EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 4); + MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + + SmallVector<SDValue, 4> Parts[4]; + for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) { + for (unsigned P = 0; P < 4; ++P) + Parts[P].push_back(Op.getOperand(I + P * E)); + } + SDValue Casts[4]; + for (unsigned P = 0; P < 4; ++P) { + SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); + Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + } + + SDValue Blend = + 
DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -8674,7 +8728,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.getSizeInBits() == 128) + if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) return splitTernaryVectorOp(Op, DAG); assert(VT.getSizeInBits() == 64); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1258,6 +1258,26 @@ (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) >; +def : Pat < + (extract_subvector v16i16:$vec, (i32 0)), + (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3)) +>; + +def : Pat < + (extract_subvector v16i16:$vec, (i32 8)), + (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7)) +>; + +def : Pat < + (extract_subvector v16f16:$vec, (i32 0)), + (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3)) +>; + +def : Pat < + (extract_subvector v16f16:$vec, (i32 8)), + (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7)) +>; + foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) @@ -1417,7 +1437,18 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; - +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; // 512-bit bitcast def : BitConvert ; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -786,7 +786,7 @@ defm "" : SRegClass<5, 16, 
[v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; -defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<8, 19, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; defm "" : SRegClass<16, 20, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, 21, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; @@ -827,7 +827,7 @@ defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; -defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; +defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll @@ -76,7 +76,7 @@ ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef -; FAST16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17i16 = add <17 x i16> undef, undef +; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW16-LABEL: 'add_i16' @@ -98,7 +98,7 @@ ; FAST16-SIZE-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17i16 = add <17 x i16> undef, undef +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW16-SIZE-LABEL: 'add_i16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll @@ -57,8 +57,8 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> 
undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -115,8 +115,8 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ 
-237,8 +237,8 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -295,8 +295,8 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x 
i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll @@ -57,8 +57,8 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost 
of 192 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -115,8 +115,8 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -237,8 +237,8 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -295,8 +295,8 @@ ; FAST-SIZE-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -115,7 +115,7 @@ ; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fadd <17 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f16' @@ -135,7 +135,7 @@ ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fadd <17 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fadd_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -165,7 +165,7 @@ ; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 10 
for instruction: ret void ; ; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ieee' @@ -185,7 +185,7 @@ ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = fdiv half undef, undef @@ -216,7 +216,7 @@ ; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz' @@ -236,7 +236,7 @@ ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; 
FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = fdiv half undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -145,7 +145,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2 ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2 ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2 -; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_f16' @@ -165,7 +165,7 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2 -; FAST-SIZE-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -115,7 +115,7 @@ ; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fmul <17 x half> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f16' @@ -135,7 +135,7 @@ ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fmul <17 x half> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll b/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll @@ -182,7 +182,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> -; FAST-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; FAST-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'fptosi_float_i16' @@ -198,7 +198,7 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLOW-SIZE-LABEL: 'fptosi_float_i16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll b/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll +++ 
b/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll @@ -182,7 +182,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> -; FAST-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; FAST-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'fptoui_float_i16' @@ -198,7 +198,7 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; SLOW-SIZE-LABEL: 'fptoui_float_i16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -115,7 +115,7 @@ ; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x 
half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fsub <17 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f16' @@ -135,7 +135,7 @@ ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fsub <17 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fsub_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll @@ -79,7 +79,7 @@ ; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef -; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17i16 = mul <17 x i16> undef, undef +; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v17i16 = mul <17 x i16> undef, undef ; 
FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW16-SIZE-LABEL: 'mul_i16' @@ -99,7 +99,7 @@ ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = mul <4 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = mul <5 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = mul <16 x i16> undef, undef -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = mul <17 x i16> undef, undef +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17i16 = mul <17 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %i16 = mul i16 undef, undef @@ -144,7 +144,7 @@ ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -182,7 +182,7 @@ ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = 
mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, @@ -244,7 +244,7 @@ ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -282,7 +282,7 @@ ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, @@ -344,7 +344,7 @@ ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -382,7 +382,7 @@ ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for 
instruction: %V32i8 = mul <32 x i8> undef, @@ -444,7 +444,7 @@ ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -482,7 +482,7 @@ ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -2,8 +2,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) { -; SI-LABEL: extract_4xi16: +define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) { +; SI-LABEL: vec_8xi16_extract_4xi16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cbranch_scc0 .LBB0_2 @@ -90,7 +90,7 @@ ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: extract_4xi16: +; GFX9-LABEL: vec_8xi16_extract_4xi16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 @@ -143,8 +143,8 @@ ret <4 x i16> %r2 } -define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) { -; SI-LABEL: extract_4xi16_2: +define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) { +; SI-LABEL: vec_8xi16_extract_4xi16_2: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cbranch_scc0 .LBB1_2 @@ -234,7 +234,7 @@ ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: extract_4xi16_2: +; GFX9-LABEL: vec_8xi16_extract_4xi16_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 @@ -287,8 +287,8 @@ ret <4 x i16> %r2 } -define <4 x half> @extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) { -; SI-LABEL: extract_4xf16: +define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) { +; SI-LABEL: vec_8xf16_extract_4xf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
SI-NEXT: s_cbranch_scc0 .LBB2_2 @@ -376,7 +376,7 @@ ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: extract_4xf16: +; GFX9-LABEL: vec_8xf16_extract_4xf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 @@ -432,3 +432,578 @@ %r2 = select <4 x i1> %b2, <4 x half> , <4 x half> ret <4 x half> %r2 } + +define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) { +; +; SI-LABEL: vec_16xi16_extract_4xi16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cbranch_scc0 .LBB3_2 +; SI-NEXT: ; %bb.1: ; %F +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 
glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: s_branch .LBB3_4 +; SI-NEXT: .LBB3_2: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB3_3: ; %T +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: .LBB3_4: ; %exit +; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, 0x8000 +; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v6, 1 +; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: vec_16xi16_extract_4xi16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; %F +; GFX9-NEXT: global_load_dwordx4 v[4:7], 
v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX9-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-NEXT: s_branch .LBB3_4 +; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: .LBB3_3: ; %T +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: .LBB3_4: ; %exit +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + br i1 undef, label %T, label %F + +T: + %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0 + br label %exit + +F: + %f = 
load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1 + br label %exit + +exit: + %m = phi <16 x i16> [ %t, %T ], [ %f, %F ] + %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> + %b2 = icmp sgt <4 x i16> %v2, + %r2 = select <4 x i1> %b2, <4 x i16> , <4 x i16> + ret <4 x i16> %r2 +} + +define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) { +; +; SI-LABEL: vec_16xi16_extract_4xi16_2: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cbranch_scc0 .LBB4_2 +; SI-NEXT: ; %bb.1: ; %F +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 
0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB4_3 +; SI-NEXT: s_branch .LBB4_4 +; SI-NEXT: .LBB4_2: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB4_3: ; %T +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort 
v6, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v4, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: .LBB4_4: ; %exit +; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 +; SI-NEXT: v_mov_b32_e32 v4, 0xffff +; SI-NEXT: v_mov_b32_e32 v5, 0x8000 +; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v7, 1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: vec_16xi16_extract_4xi16_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 +; GFX9-NEXT: ; %bb.1: ; %F +; GFX9-NEXT: 
global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: s_branch .LBB4_4 +; GFX9-NEXT: .LBB4_2: +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: .LBB4_3: ; %T +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: .LBB4_4: ; %exit +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + br i1 undef, label %T, label %F + +T: + %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0 + br 
label %exit + +F: + %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1 + br label %exit + +exit: + %m = phi <16 x i16> [ %t, %T ], [ %f, %F ] + %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> + %b2 = icmp sgt <4 x i16> %v2, + %r2 = select <4 x i1> %b2, <4 x i16> , <4 x i16> + ret <4 x i16> %r2 +} + +define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 x half> addrspace(1) * %p1) { +; +; SI-LABEL: vec_16xf16_extract_4xf16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_cbranch_scc0 .LBB5_2 +; SI-NEXT: ; %bb.1: ; %F +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB5_3 +; SI-NEXT: s_branch .LBB5_4 +; SI-NEXT: .LBB5_2: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB5_3: ; %T +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 
offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: .LBB5_4: ; %exit +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 +; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: vec_16xf16_extract_4xf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 +; GFX9-NEXT: ; %bb.1: ; %F +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 
glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX9-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-NEXT: s_branch .LBB5_4 +; GFX9-NEXT: .LBB5_2: +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_mov_b32 s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s8 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_mov_b32 s15, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: .LBB5_3: ; %T +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: .LBB5_4: ; %exit +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 +; GFX9-NEXT: s_setpc_b64 s[30:31] + br i1 undef, label %T, label %F + +T: + %t = load volatile <16 x 
half>, <16 x half> addrspace(1) * %p0 + br label %exit + +F: + %f = load volatile <16 x half>, <16 x half> addrspace(1) * %p1 + br label %exit + +exit: + %m = phi <16 x half> [ %t, %T ], [ %f, %F ] + %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> + %b2 = fcmp ugt <4 x half> %v2, + %r2 = select <4 x i1> %b2, <4 x half> , <4 x half> + ret <4 x half> %r2 +} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -179,6 +179,19 @@ ret void } +; GCN-LABEL: {{^}}v_extractelement_v16f16_dynamic_sgpr: +; GCN-COUNT-15: v_cndmask_b32_e32 +define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(half addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %n) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep + %vec.extract = extractelement <16 x half> %vec, i32 %n + store half %vec.extract, half addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -215,6 +215,55 @@ ret void } +; GCN-LABEL: {{^}}v_extractelement_v16i16_2: +; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4 +; SI: buffer_store_short [[RES]] +; VI: flat_load_dword [[RES:v[0-9]+]] +; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] +; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4 +; GFX9: 
global_store_short v{{[0-9]+}}, [[RES]] +define amdgpu_kernel void @v_extractelement_v16i16_2(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep + %vec.extract = extractelement <16 x i16> %vec, i32 2 + store i16 %vec.extract, i16 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_extractelement_v16i16_6: +; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12 +; SI: buffer_store_short [[RES]] +; VI: flat_load_dword [[RES:v[0-9]+]] +; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] +; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12 +; GFX9: global_store_short v{{[0-9]+}}, [[RES]] +define amdgpu_kernel void @v_extractelement_v16i16_6(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep + %vec.extract = extractelement <16 x i16> %vec, i32 6 + store i16 %vec.extract, i16 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_extractelement_v16i16_dynamic_sgpr: +; GCN-COUNT-15: v_cndmask_b32_e32 +define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(i16 addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %n) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 
%tid.ext + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep + %vec.extract = extractelement <16 x i16> %vec, i32 %n + store i16 %vec.extract, i16 addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1429,7 +1429,7 @@ ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v18, s3 ; VI-NEXT: v_mov_b32_e32 v17, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: s_add_u32 s2, s0, 0x50 ; VI-NEXT: v_mov_b32_e32 v12, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s0 @@ -1443,12 +1443,12 @@ ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; VI-NEXT: v_mov_b32_e32 v14, s3 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: v_mov_b32_e32 v13, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x60 +; VI-NEXT: s_add_u32 s2, s0, 64 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] ; VI-NEXT: v_mov_b32_e32 v16, s3 @@ -1459,37 +1459,37 @@ ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_mov_b32_e32 v15, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x50 +; VI-NEXT: s_add_u32 s2, s0, 0x70 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 -; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: s_add_u32 s0, s0, 
64 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6] +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 +; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; VI-NEXT: s_add_u32 s0, s0, 0x60 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 ; VI-NEXT: v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: v_mov_b32_e32 v14, s1 ; VI-NEXT: v_mov_b32_e32 v19, s2 -; VI-NEXT: v_mov_b32_e32 v12, s0 -; VI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[19:20], v[4:7] -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; VI-NEXT: v_mov_b32_e32 v13, s0 +; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] +; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] +; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] ; VI-NEXT: s_endpgm %val = load 
<16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x double> diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -20,9 +20,9 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define amdgpu_kernel void @s_input_output_v16f16() { - %v = tail call <16 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() - tail call void asm sideeffect "; use $0", "s"(<16 x half> %v) +define amdgpu_kernel void @s_input_output_v32f16() { + %v = tail call <32 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() + tail call void asm sideeffect "; use $0", "s"(<32 x half> %v) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: not llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=INVALID %s +; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=INVALID %s + +; GCN-LABEL: {{^}}s_input_output_v8f16 +; GCN: s_mov_b32 s[0:3], -1 +; GCN: ; use s[0:3] +; INVALID: error: couldn't allocate output register for constraint 's' +; INVALID: error: couldn't allocate input reg for constraint 's' +define amdgpu_kernel void @s_input_output_v8f16() { + %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() + tail call void asm sideeffect "; use $0", "s"(<8 x half> %v) + ret void +} + +; GCN-LABEL: {{^}}s_input_output_v8i16 +; GCN: s_mov_b32 s[0:3], -1 +; GCN: ; use s[0:3] +; INVALID: error: couldn't allocate 
output register for constraint 's' +; INVALID: error: couldn't allocate input reg for constraint 's' +define amdgpu_kernel void @s_input_output_v8i16() { + %v = tail call <8 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() + tail call void asm sideeffect "; use $0", "s"(<8 x i16> %v) + ret void +} + +; GCN-LABEL: {{^}}v_input_output_v8f16 +; GCN: v_mov_b32 v[0:3], -1 +; GCN: ; use v[0:3] +; INVALID: error: couldn't allocate output register for constraint 'v' +; INVALID: error: couldn't allocate input reg for constraint 'v' +define amdgpu_kernel void @v_input_output_v8f16() { + %v = tail call <8 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() + tail call void asm sideeffect "; use $0", "v"(<8 x half> %v) + ret void +} + +; GCN-LABEL: {{^}}v_input_output_v8i16 +; GCN: v_mov_b32 v[0:3], -1 +; GCN: ; use v[0:3] +; INVALID: error: couldn't allocate output register for constraint 'v' +; INVALID: error: couldn't allocate input reg for constraint 'v' +define amdgpu_kernel void @v_input_output_v8i16() { + %v = tail call <8 x i16> asm sideeffect "v_mov_b32 $0, -1", "=v"() + tail call void asm sideeffect "; use $0", "v"(<8 x i16> %v) + ret void +} + +; GCN-LABEL: {{^}}s_input_output_v16f16 +; GCN: s_mov_b32 s[0:7], -1 +; GCN: ; use s[0:7] +; INVALID: error: couldn't allocate output register for constraint 's' +; INVALID: error: couldn't allocate input reg for constraint 's' +define amdgpu_kernel void @s_input_output_v16f16() { + %v = tail call <16 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() + tail call void asm sideeffect "; use $0", "s"(<16 x half> %v) + ret void +} + +; GCN-LABEL: {{^}}s_input_output_v16i16 +; GCN: s_mov_b32 s[0:7], -1 +; GCN: ; use s[0:7] +; INVALID: error: couldn't allocate output register for constraint 's' +; INVALID: error: couldn't allocate input reg for constraint 's' +define amdgpu_kernel void @s_input_output_v16i16() { + %v = tail call <16 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() + tail call void asm sideeffect "; use $0", "s"(<16 x 
i16> %v) + ret void +} + +; GCN-LABEL: {{^}}v_input_output_v16f16 +; GCN: v_mov_b32 v[0:7], -1 +; GCN: ; use v[0:7] +; INVALID: error: couldn't allocate output register for constraint 'v' +; INVALID: error: couldn't allocate input reg for constraint 'v' +define amdgpu_kernel void @v_input_output_v16f16() { + %v = tail call <16 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() + tail call void asm sideeffect "; use $0", "v"(<16 x half> %v) + ret void +} + +; GCN-LABEL: {{^}}v_input_output_v16i16 +; GCN: v_mov_b32 v[0:7], -1 +; GCN: ; use v[0:7] +; INVALID: error: couldn't allocate output register for constraint 'v' +; INVALID: error: couldn't allocate input reg for constraint 'v' +define amdgpu_kernel void @v_input_output_v16i16() { + %v = tail call <16 x i16> asm sideeffect "v_mov_b32 $0, -1", "=v"() + tail call void asm sideeffect "; use $0", "v"(<16 x i16> %v) + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2059,6 +2059,624 @@ ret void } +define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val) { +; GFX9-LABEL: v_insertelement_v16f16_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v16f16_3: 
+; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: s_lshl_b32 s1, s4, 16 +; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: v_mov_b32_e32 v12, s1 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_insertelement_v16f16_3: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc +; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] +; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8 +; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8 +; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; 
CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v16f16_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3 + store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @v_insertelement_v16i16_6(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %val) { +; GFX9-LABEL: v_insertelement_v16i16_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfi_b32 v3, v9, s6, v3 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v16i16_6: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: s_mov_b32 s2, 0xffff +; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_bfi_b32 v3, s2, v12, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_insertelement_v16i16_6: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8 +; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8 +; CI-NEXT: s_mov_b32 s2, 0xffff +; CI-NEXT: v_mov_b32_e32 v12, s4 +; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; CI-NEXT: 
s_waitcnt vmcnt(1) +; CI-NEXT: v_bfi_b32 v3, s2, v12, v3 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v16i16_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to i16 + %vecins = insertelement <16 x i16> %vec, i16 %val.cvt, i32 6 + store <16 x i16> %vecins, <16 x i16> addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val, i32 %n) { +; GFX9-LABEL: v_insertelement_v16f16_dynamic: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: 
v_mov_b32_e32 v9, s6 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 14 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 13 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 12 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 11 +; GFX9-NEXT: 
v_lshl_or_b32 v3, v10, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 10 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 9 +; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 8 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v9, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v7, v13, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v6, v12, 16, v6 +; GFX9-NEXT: v_lshl_or_b32 v5, v10, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v16f16_dynamic: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: s_cmp_eq_u32 s7, 14 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; VI-NEXT: v_mov_b32_e32 
v12, s6 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 15 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 12 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 13 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[0:1] +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 10 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 11 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 8 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v15, v15, v12, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s7, 9 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 4 +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 +; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; 
VI-NEXT: s_cmp_eq_u32 s7, 5 +; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 2 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 0 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s7, 1 +; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_insertelement_v16f16_dynamic: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v5, vcc +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; CI-NEXT: s_cmp_eq_u32 s5, 15 +; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 14 +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 13 +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 12 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 11 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v10, s[2:3] +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 10 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; CI-NEXT: v_or_b32_e32 v2, v2, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_or_b32_e32 v1, v1, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 +; CI-NEXT: s_cmp_eq_u32 s5, 9 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: 
v_lshrrev_b32_e32 v16, 16, v6 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 8 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 6 +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 5 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 4 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_or_b32_e32 v3, v3, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; CI-NEXT: v_or_b32_e32 v0, v0, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_or_b32_e32 v6, v6, v12 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v11, 
v11 +; CI-NEXT: v_cndmask_b32_e32 v12, v12, v10, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; CI-NEXT: v_or_b32_e32 v5, v5, v10 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; CI-NEXT: v_or_b32_e32 v4, v4, v10 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v8 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v16f16_dynamic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 3 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 +; 
GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 15 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 14 +; GFX11-NEXT: v_lshl_or_b32 v2, v10, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 13 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 12 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 11 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 10 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 9 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 8 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s3 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s0, s2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v7, v10, 16, v7 +; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v6 +; GFX11-NEXT: v_lshl_or_b32 v5, v13, 16, v5 +; GFX11-NEXT: 
v_lshl_or_b32 v4, v14, 16, v4 +; GFX11-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n + store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep + ret void +} + + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -555,21 +555,21 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v0, v8 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v8, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v1, v9 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v9, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v16, v10 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v10, v16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v17, v11 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v11, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v4, v12 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v12, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt 
vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v5, v13 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v13, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v18, v14 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v14, v18 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v19, v15 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v15, v19 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2282,20 +2282,20 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s13, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 @@ -2303,8 +2303,8 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s7, 16 @@ -2312,8 +2312,8 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 16 @@ -2321,8 +2321,8 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s3, 16 @@ -2330,8 +2330,8 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 @@ -2339,8 +2339,8 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -3242,157 +3242,159 @@ ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 
s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s69, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s52, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s0, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s54, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s3, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s56, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s58, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s13, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s12, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s69, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s30, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s0, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s60, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s61, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s62, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s63, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s29, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s28, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; 
GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s27, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s26, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s25, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s24, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s23, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff 
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s22, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s21, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s20, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s19, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s18, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s17, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s15, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s29, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s13, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s28, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s27, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s11, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s26, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s25, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s24, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s23, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s22, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s20, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff -; GCN-NOHSA-VI-NEXT: 
s_lshr_b32 s34, s16, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s61 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -4865,16 +4867,16 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ 
-5558,30 +5560,30 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s8, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s8, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s10, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s7, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s6, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v2, s10 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 @@ -5882,57 +5884,57 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[18:19], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 +; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s1, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], 
s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 @@ -5941,10 +5943,10 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 @@ -6316,87 +6318,87 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 16 +; 
GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s12, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s11, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s10, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s9, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s8, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 -; GCN-NOHSA-VI-NEXT: 
s_and_b32 s27, s7, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s5, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s3, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s4, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s3, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s2, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s2, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6920,153 +6922,134 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s64, s11 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s80, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s82, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s0, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s5 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 
0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s56, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s14, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[34:35], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[64:65], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[80:81], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[82:83], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s13 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s11 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s3 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s2, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s3 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s3, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s72, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s13, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[76:77], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], 
s[44:45], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[58:59], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[78:79], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s12, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[72:73], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[74:75], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s9 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s11, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s76 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s77 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[38:39], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[52:53], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[70:71], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; 
GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[50:51], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s44 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 diff --git 
a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -736,21 +736,21 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v19, 16, v7 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v0, v8 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v8, v0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v1, v9 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v9, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v16, v10 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v10, v16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v17, v11 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v11, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v4, v12 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v12, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v5, v13 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v13, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v18, v14 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v14, v18 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v19, v15 +; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v15, v19 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2783,45 +2783,45 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 
0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, 
s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -3805,108 +3805,135 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: +; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; GCN-NOHSA-VI-NEXT: 
v_and_b32_e32 v60, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], 
off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v26 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v25 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 
0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v40 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v56 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v41 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 +; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v42 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v41 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v40 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v58 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v57 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v58 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v57 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v22 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v18 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: 
v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5792,19 +5819,19 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 
v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v4i16_to_v4i64: @@ -6639,15 +6666,9 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 @@ -6655,31 +6676,37 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v4, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -7029,40 +7056,39 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 @@ -7074,7 +7100,7 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -7568,95 +7594,95 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v59, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v57 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; 
GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v33 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v33 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v38 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, 
v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v37 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v57 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v57 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v57 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:208 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -8293,113 +8319,99 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; 
GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[12:13] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[0:1] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[4:5] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v5, 0, 16 ; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[8:9] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[14:15] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[17:18], 48, v[2:3] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 
offset:176 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[17:18], 48, v[6:7] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[17:18], 48, v[10:11] +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v3 -; 
GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 ; 
GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i16_to_v32i64: