Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4578,6 +4578,19 @@
   return SDValue();
 }
 
+static unsigned workitemIntrinsicDim(unsigned ID) {
+  switch (ID) {
+  case Intrinsic::amdgcn_workitem_id_x:
+    return 0;
+  case Intrinsic::amdgcn_workitem_id_y:
+    return 1;
+  case Intrinsic::amdgcn_workitem_id_z:
+    return 2;
+  default:
+    llvm_unreachable("not a workitem intrinsic");
+  }
+}
+
 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
     const SelectionDAG &DAG, unsigned Depth) const {
@@ -4714,6 +4727,14 @@
     Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
     break;
   }
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z: {
+    unsigned MaxValue = Subtarget->getMaxWorkitemID(
+        DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
+    Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+    break;
+  }
   default:
     break;
   }
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -96,9 +96,13 @@
                       const TargetRegisterClass *ArgRC, LLT ArgTy) const;
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+
   bool legalizePreloadedArgIntrin(
       MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
       AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+  bool legalizeWorkitemIDIntrinsic(
+      MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+      unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
 
   Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const;
   bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B,
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3004,6 +3004,53 @@
   return true;
 }
 
+static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
+                                int64_t C) {
+  B.buildConstant(MI.getOperand(0).getReg(), C);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
+  if (MaxID == 0)
+    return replaceWithConstant(B, MI, 0);
+
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+  const ArgDescriptor *Arg;
+  const TargetRegisterClass *ArgRC;
+  LLT ArgTy;
+  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  if (!Arg) {
+    // It's undefined behavior if a function marked with the amdgpu-no-*
+    // attributes uses the corresponding intrinsic.
+    B.buildUndef(DstReg);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (Arg->isMasked()) {
+    // Don't bother inserting AssertZext for packed IDs since we're emitting the
+    // masking operations anyway.
+    //
+    // TODO: We could assert the top bit is 0 for the source copy.
+ if (!loadInputValue(DstReg, B, ArgType)) + return false; + } else { + Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + if (!loadInputValue(TmpReg, B, ArgType)) + return false; + B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); + } + + MI.eraseFromParent(); + return true; +} + Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const { LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -5072,12 +5119,6 @@ return true; } -static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { - B.buildConstant(MI.getOperand(0).getReg(), C); - MI.eraseFromParent(); - return true; -} - bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const { unsigned Opc; @@ -5202,22 +5243,14 @@ case Intrinsic::amdgcn_implicitarg_ptr: return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0) - return replaceWithConstant(B, MI, 0); - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_X); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, + AMDGPUFunctionArgInfo::WORKITEM_ID_X); case Intrinsic::amdgcn_workitem_id_y: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0) - return replaceWithConstant(B, MI, 0); - - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y); case Intrinsic::amdgcn_workitem_id_z: - if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0) - return replaceWithConstant(B, MI, 0); - - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X); Index: llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -156,11 +156,8 @@ Changed = true; break; - case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: - case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: - case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: case Intrinsic::r600_read_local_size_x: case Intrinsic::r600_read_local_size_y: Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -79,6 +79,9 @@ SDValue lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const; + SDValue lowerWorkitemID(SelectionDAG &DAG, SDValue Op, unsigned Dim, + const ArgDescriptor &ArgDesc) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6770,6 +6770,32 @@ return Loads[0]; } +SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, + unsigned Dim, + 
const ArgDescriptor &Arg) const {
+  SDLoc SL(Op);
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
+  if (MaxID == 0)
+    return DAG.getConstant(0, SL, MVT::i32);
+
+  SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+                               SDLoc(DAG.getEntryNode()), Arg);
+
+  // Don't bother inserting AssertZext for packed IDs since we're emitting the
+  // masking operations anyway.
+  //
+  // TODO: We could assert the top bit is 0 for the source copy.
+  if (Arg.isMasked())
+    return Val;
+
+  // Preserve the known bits after expansion to a copy.
+  EVT SmallVT =
+      EVT::getIntegerVT(*DAG.getContext(), 32 - countLeadingZeros(MaxID));
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
+                     DAG.getValueType(SmallVT));
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -6916,26 +6942,11 @@
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
   case Intrinsic::amdgcn_workitem_id_x:
-    if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
-      return DAG.getConstant(0, DL, MVT::i32);
-
-    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
-                          SDLoc(DAG.getEntryNode()),
-                          MFI->getArgInfo().WorkItemIDX);
+    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
   case Intrinsic::amdgcn_workitem_id_y:
-    if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
-      return DAG.getConstant(0, DL, MVT::i32);
-
-    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
-                          SDLoc(DAG.getEntryNode()),
-                          MFI->getArgInfo().WorkItemIDY);
+    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
   case Intrinsic::amdgcn_workitem_id_z:
-    if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
-      return DAG.getConstant(0, DL, MVT::i32);
-
-    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
-                          SDLoc(DAG.getEntryNode()),
-                          MFI->getArgInfo().WorkItemIDZ);
+    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
   case Intrinsic::amdgcn_wavefrontsize:
     return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                            SDLoc(Op), MVT::i32);
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.workitem.id.mir
@@ -0,0 +1,159 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -run-pass=legalizer -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @test_workitem_id_x_unpacked() !reqd_work_group_size !0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @test_workitem_id_y_unpacked() !reqd_work_group_size !0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @test_workitem_id_z_unpacked() !reqd_work_group_size !0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @test_workitem_id_x_packed() !reqd_work_group_size !0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @test_workitem_id_y_packed() !reqd_work_group_size !0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @test_workitem_id_z_packed() !reqd_work_group_size !0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @missing_arg_info() "amdgpu-no-workitem-id-x" {
+    ret void
+  }
+
+  !0 = !{i32 256, i32 8, i32 4}
+...
+--- +name: test_workitem_id_x_unpacked +machineFunctionInfo: + argumentInfo: + workGroupIDX: { reg: '$sgpr2' } + workItemIDX: { reg: '$vgpr0' } + workItemIDY: { reg: '$vgpr1' } + workItemIDZ: { reg: '$vgpr2' } +body: | + bb.0: + ; GCN-LABEL: name: test_workitem_id_x_unpacked + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY1]], 8 + ; GCN-NEXT: S_ENDPGM 0, implicit [[ASSERT_ZEXT]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) + S_ENDPGM 0, implicit %0 +... + +--- +name: test_workitem_id_y_unpacked +machineFunctionInfo: + argumentInfo: + workGroupIDX: { reg: '$sgpr2' } + workItemIDX: { reg: '$vgpr0' } + workItemIDY: { reg: '$vgpr1' } + workItemIDZ: { reg: '$vgpr2' } +body: | + bb.0: + ; GCN-LABEL: name: test_workitem_id_y_unpacked + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY1]], 3 + ; GCN-NEXT: S_ENDPGM 0, implicit [[ASSERT_ZEXT]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.y) + S_ENDPGM 0, implicit %0 +... + +--- +name: test_workitem_id_z_unpacked +machineFunctionInfo: + argumentInfo: + workGroupIDX: { reg: '$sgpr2' } + workItemIDX: { reg: '$vgpr0' } + workItemIDY: { reg: '$vgpr1' } + workItemIDZ: { reg: '$vgpr2' } +body: | + bb.0: + ; GCN-LABEL: name: test_workitem_id_z_unpacked + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s32) = G_ASSERT_ZEXT [[COPY1]], 2 + ; GCN-NEXT: S_ENDPGM 0, implicit [[ASSERT_ZEXT]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.z) + S_ENDPGM 0, implicit %0 +... + +--- +name: test_workitem_id_x_packed +machineFunctionInfo: + argumentInfo: + workItemIDX: { reg: '$vgpr0', mask: 1023 } + workItemIDY: { reg: '$vgpr0', mask: 1047552 } + workItemIDZ: { reg: '$vgpr0', mask: 1072693248 } +body: | + bb.0: + ; GCN-LABEL: name: test_workitem_id_x_packed + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1023 + ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) + S_ENDPGM 0, implicit %0 +... + +--- +name: test_workitem_id_y_packed +machineFunctionInfo: + argumentInfo: + workItemIDX: { reg: '$vgpr0', mask: 1023 } + workItemIDY: { reg: '$vgpr0', mask: 1047552 } + workItemIDZ: { reg: '$vgpr0', mask: 1072693248 } +body: | + bb.0: + ; GCN-LABEL: name: test_workitem_id_y_packed + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1023 + ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.y) + S_ENDPGM 0, implicit %0 +... 
+ +--- +name: test_workitem_id_z_packed +machineFunctionInfo: + argumentInfo: + workItemIDX: { reg: '$vgpr0', mask: 1023 } + workItemIDY: { reg: '$vgpr0', mask: 1047552 } + workItemIDZ: { reg: '$vgpr0', mask: 1072693248 } +body: | + bb.0: + ; GCN-LABEL: name: test_workitem_id_z_packed + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1023 + ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[AND]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.z) + S_ENDPGM 0, implicit %0 +... + +--- +name: missing_arg_info +body: | + bb.0: + ; GCN-LABEL: name: missing_arg_info + ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN-NEXT: S_ENDPGM 0, implicit [[DEF]](s32) + %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.z) + S_ENDPGM 0, implicit %0 +... Index: llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -787,14 +787,10 @@ ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -863,15 +859,11 @@ ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 @@ -945,14 +937,10 @@ ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 @@ -1030,14 +1018,10 @@ ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 @@ -1145,28 +1129,24 @@ ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3 -; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2 -; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1 -; GFX9-NEXT: global_load_ubyte v6, v[0:1], off +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid @@ -1296,15 +1276,11 @@ ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2c ; GFX9-NEXT: s_movk_i32 s4, 0x900 @@ -1458,37 +1434,33 @@ ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:4 -; GFX9-NEXT: global_load_ubyte v3, v[0:1], off offset:6 -; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:3 -; GFX9-NEXT: global_load_ubyte 
v5, v[0:1], off offset:2 -; GFX9-NEXT: global_load_ubyte v7, v[0:1], off offset:1 -; GFX9-NEXT: global_load_ubyte v8, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] offset:3 +; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 +; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] offset:1 +; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v3 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v5 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v7 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v10 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v10 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx3 v9, v[4:6], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid @@ -1574,14 +1546,10 @@ ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[7:8], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 @@ -1659,15 +1627,11 @@ ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 @@ -1736,15 +1700,11 @@ ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1811,14 +1771,10 @@ ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,28 +1875,24 @@ ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3 -; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2 -; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1 -; GFX9-NEXT: global_load_ubyte v6, v[0:1], off +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid @@ -2003,15 +1955,11 @@ ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2078,15 
+2026,11 @@ ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2154,15 +2098,11 @@ ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2230,15 +2170,11 @@ ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2313,14 +2249,10 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2c -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 Index: llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -43,7 +43,7 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s0, s0, 4 @@ -76,7 +76,7 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: 
s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_add_i32 s0, s0, 4 @@ -142,7 +142,7 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s0, s0, 1 @@ -176,7 +176,7 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s0, s0, 1 @@ -243,7 +243,7 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: v_mul_u32_u24_e32 v0, 2, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 @@ -277,7 +277,7 @@ ; GFX940: ; %bb.0: ; %bb ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v1, 1 -; GFX940-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -151,8 +151,8 @@ ; GCN-LABEL: mubuf_clause: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v31 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff0, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12 ; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 @@ -211,8 +211,8 @@ ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31 -; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2 +; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v2, 4, v31 +; GCN-SCRATCH-NEXT: v_and_b32_e32 v18, 0x3ff0, v2 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 ; GCN-SCRATCH-NEXT: s_clause 0x3 ; GCN-SCRATCH-NEXT: scratch_load_dwordx4 v[2:5], v0, off Index: llvm/test/CodeGen/AMDGPU/zext-lid.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/zext-lid.ll +++ llvm/test/CodeGen/AMDGPU/zext-lid.ll @@ -1,14 +1,8 @@ ; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,O2 %s ; RUN: llc -O0 -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s ; GCN-LABEL: {{^}}zext_grp_size_128: ; GCN-NOT: and_b32 - -; OPT-LABEL: @zext_grp_size_128 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !0 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !0 define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 { bb: %tmp = tail 
call i32 @llvm.amdgcn.workitem.id.x() @@ -27,11 +21,6 @@ ; GCN-LABEL: {{^}}zext_grp_size_32x4x1: ; GCN-NOT: and_b32 - -; OPT-LABEL: @zext_grp_size_32x4x1 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !4 define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -53,8 +42,6 @@ ; When EarlyCSE is not run this call produces a range max with 0 active bits, ; which is a special case as an AssertZext from width 0 is invalid. -; OPT-LABEL: @zext_grp_size_1x1x1 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !4 define amdgpu_kernel void @zext_grp_size_1x1x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !1 { %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 1 @@ -64,11 +51,6 @@ ; GCN-LABEL: {{^}}zext_grp_size_512: ; GCN-NOT: and_b32 - -; OPT-LABEL: @zext_grp_size_512 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !6 define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -89,9 +71,6 @@ ; O2-NOT: and_b32 ; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff, ; O2-NOT: and_b32 - -; OPT-LABEL: @func_test_workitem_id_x_known_max_range( -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 define void @func_test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -104,9 +83,6 @@ ; O2-NOT: and_b32 ; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff, ; O2-NOT: and_b32 - -; OPT-LABEL: @func_test_workitem_id_x_default_range( -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7 define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -129,12 +105,3 @@ !0 = !{i32 32, i32 4, i32 1} !1 = !{i32 1, i32 1, i32 1} - -; OPT: !0 = !{i32 0, i32 128} -; OPT: !1 = !{i32 32, i32 4, i32 1} -; OPT: !2 = !{i32 0, i32 32} -; OPT: !3 = !{i32 0, i32 4} -; OPT: !4 = !{i32 0, i32 1} -; OPT: !5 = !{i32 1, i32 1, i32 1} -; OPT: !6 = !{i32 0, i32 512} -; OPT: !7 = !{i32 0, i32 1024}