diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -172,6 +172,9 @@ bool matchCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo); void applyCombineExtendingLoads(MachineInstr &MI, PreferredTuple &MatchInfo); + /// Match (and (load x), mask) -> zextload x + bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Combine \p MI into a pre-indexed or post-indexed load/store operation if /// legal and the surrounding code makes it useful. bool tryCombineIndexedLoadStore(MachineInstr &MI); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -130,7 +130,13 @@ (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD):$root, [{ return Helper.matchCombineExtendingLoads(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyCombineExtendingLoads(*${root}, ${matchinfo}); }])>; -def combines_for_extload: GICombineGroup<[extending_loads]>; + +def load_and_mask : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_AND):$root, + [{ return Helper.matchCombineLoadWithAndMask(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +def combines_for_extload: GICombineGroup<[extending_loads, load_and_mask]>; def sext_trunc_sextload : GICombineRule< (defs root:$d), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -633,6 +633,76 @@ Observer.changedInstr(MI); } +bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI, + BuildFnTy &MatchInfo) { 
+ assert(MI.getOpcode() == TargetOpcode::G_AND); + + // If we have the following code: + // %mask = G_CONSTANT 255 + // %ld = G_LOAD %ptr, (load s16) + // %and = G_AND %ld, %mask + // + // Try to fold it into + // %ld = G_ZEXTLOAD %ptr, (load s8) + + Register Dst = MI.getOperand(0).getReg(); + if (MRI.getType(Dst).isVector()) + return false; + + auto MaybeMask = + getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (!MaybeMask) + return false; + + APInt MaskVal = MaybeMask->Value; + + if (!MaskVal.isMask()) + return false; + + Register SrcReg = MI.getOperand(1).getReg(); + GAnyLoad *LoadMI = getOpcodeDef<GAnyLoad>(SrcReg, MRI); + if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()) || + !LoadMI->isSimple()) + return false; + + Register LoadReg = LoadMI->getDstReg(); + LLT LoadTy = MRI.getType(LoadReg); + Register PtrReg = LoadMI->getPointerReg(); + uint64_t LoadSizeBits = LoadMI->getMemSizeInBits(); + unsigned MaskSizeBits = MaskVal.countTrailingOnes(); + + // The mask may not be larger than the in-memory type, as it might cover sign + // extended bits + if (MaskSizeBits > LoadSizeBits) + return false; + + // If the mask covers the whole destination register, there's nothing to + // extend + if (MaskSizeBits >= LoadTy.getSizeInBits()) + return false; + + // Most targets cannot deal with loads of size < 8 and need to re-legalize to + // at least byte loads. 
Avoid creating such loads here + if (MaskSizeBits < 8 || !isPowerOf2_32(MaskSizeBits)) + return false; + + const MachineMemOperand &MMO = LoadMI->getMMO(); + LegalityQuery::MemDesc MemDesc(MMO); + MemDesc.MemoryTy = LLT::scalar(MaskSizeBits); + if (!isLegalOrBeforeLegalizer( + {TargetOpcode::G_ZEXTLOAD, {LoadTy, MRI.getType(PtrReg)}, {MemDesc}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.setInstrAndDebugLoc(*LoadMI); + auto &MF = B.getMF(); + auto PtrInfo = MMO.getPointerInfo(); + auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MaskSizeBits / 8); + B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO); + }; + return true; +} + bool CombinerHelper::isPredecessor(const MachineInstr &DefMI, const MachineInstr &UseMI) { assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() && diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-and-mask.mir @@ -0,0 +1,252 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -aarch64prelegalizercombinerhelper-only-enable-rule="load_and_mask" -verify-machineinstrs %s -o - | FileCheck %s + +# REQUIRES: asserts + +# Check that we can fold and ({any,zext,sext}load, mask) -> zextload + +--- +name: test_anyext_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_anyext_1 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8)) + ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[LOAD]], [[C]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s8) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + %0:_(p0) = COPY $x0 + %1:_(s8) = G_CONSTANT i8 1 
+ %2:_(s8) = G_LOAD %0 :: (load (s8)) + %3:_(s8) = G_AND %2, %1 + %4:_(s32) = G_ANYEXT %3 + $w0 = COPY %4 +... + +--- +name: test_anyext_s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_anyext_s16 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s16) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ZEXTLOAD]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + %0:_(p0) = COPY $x0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(s16) = G_LOAD %0 :: (load (s8)) + %3:_(s16) = G_AND %2, %1 + %4:_(s32) = G_ANYEXT %3 + $w0 = COPY %4 +... + +--- +name: test_anyext_s32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_anyext_s32 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) + ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_LOAD %0 :: (load (s8)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... + +--- +name: test_load_s32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_load_s32 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 4) + ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_LOAD %0 :: (load (s32)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... 
+ + +--- +name: test_load_mask_size_equals_dst_size +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; The combine should only apply if the mask zeroes actual bits of the dst type + ; If it doesn't, the mask is redundant and we have other combines to fold it away + + ; CHECK-LABEL: name: test_load_mask_size_equals_dst_size + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]] + ; CHECK: $w0 = COPY [[AND]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 4294967295 + %2:_(s32) = G_LOAD %0 :: (load (s32)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... + +--- +name: test_zext +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_zext + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 2) + ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_ZEXTLOAD %0 :: (load (s16)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... + +--- +name: test_zext_mask_larger_memsize +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; The combine should only apply if the mask narrows the memory size. + ; We have another combine that folds redundant masks + + ; CHECK-LABEL: name: test_zext_mask_larger_memsize + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ZEXTLOAD]], [[C]] + ; CHECK: $w0 = COPY [[AND]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 65535 + %2:_(s32) = G_ZEXTLOAD %0 :: (load (s8)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... 
+ +--- +name: test_sext +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_sext + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8), align 2) + ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_SEXTLOAD %0 :: (load (s16)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... + +--- +name: test_sext_mask_larger_memsize +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_sext_mask_larger_memsize + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load (s8)) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXTLOAD]], [[C]] + ; CHECK: $w0 = COPY [[AND]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 65535 + %2:_(s32) = G_SEXTLOAD %0 :: (load (s8)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... + +--- +name: test_non_pow2_memtype +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_non_pow2_memtype + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s24) = G_CONSTANT i24 7 + ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[COPY]](p0) :: (load (s24), align 4) + ; CHECK: [[AND:%[0-9]+]]:_(s24) = G_AND [[LOAD]], [[C]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s24) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + %0:_(p0) = COPY $x0 + %1:_(s24) = G_CONSTANT i24 7 + %2:_(s24) = G_LOAD %0 :: (load (s24)) + %3:_(s24) = G_AND %2, %1 + %4:_(s32) = G_ANYEXT %3 + $w0 = COPY %4 +... 
+ + +--- +name: test_no_mask +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_no_mask + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 510 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8)) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]] + ; CHECK: $w0 = COPY [[AND]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 510 + %2:_(s32) = G_LOAD %0 :: (load (s8)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... + +--- +name: test_volatile +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_volatile + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (volatile load (s8)) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]] + ; CHECK: $w0 = COPY [[AND]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 255 + %2:_(s32) = G_LOAD %0 :: (volatile load (s8)) + %3:_(s32) = G_AND %2, %1 + $w0 = COPY %3 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -462,7 +462,6 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -479,7 +478,7 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -538,22 +537,17 @@ ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_movk_i32 s0, 0xff +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, s0, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, s0, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, s0, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s0, v0 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -580,13 
+574,13 @@ ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -784,22 +778,17 @@ ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_movk_i32 s0, 0xff +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, s0, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, s0, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, s0, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s0, v0 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -826,13 +815,13 @@ ; VI-NEXT: v_mov_b32_e32 v5, 
s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -854,11 +843,10 @@ ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -873,9 +861,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -30,40 +30,27 @@ ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:9 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:10 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[0:1], off offset:11 -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s4, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v3, v4, v5 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; 
GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v12, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v13, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, s4, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v10, v0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v12 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v4 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v10 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v13 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v6, v7, v8 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -94,40 +81,23 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11 -; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v1 -; 
GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v5 @@ -160,19 +130,12 @@ ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[0:1], off offset:6 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[0:1], off offset:8 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[0:1], off offset:10 -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, 
s4, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v2, s4, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, s4, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -197,25 +160,15 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, 
v2, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v6, v5 ; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 2 ret <3 x i32> %load @@ -405,43 +358,30 @@ ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:11 -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 8, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v6, 8, v5 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v3, v4, v5 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, 
v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v9 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v9, v0, v10 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v1, v2, v3 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v4, v5, v6 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v7, v8 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v6, v7, v8 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1: @@ -471,41 +411,26 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11 -; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff -; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 
v2, s0, v2 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v12 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v12 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v12 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v12 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-NOUNALIGNED-NEXT: 
v_lshlrev_b32_e32 v11, 24, v11 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 @@ -541,21 +466,14 @@ ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] offset:10 -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v4 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v4 +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX9-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; @@ -580,24 +498,14 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:8 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt 
vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -39,141 +39,106 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_u8 v1, v0 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX9-NEXT: ds_read_u8 v4, v0 offset:2 -; GFX9-NEXT: ds_read_u8 v5, v0 offset:3 -; GFX9-NEXT: ds_read_u8 v6, v0 offset:4 -; GFX9-NEXT: ds_read_u8 v7, v0 offset:5 -; GFX9-NEXT: ds_read_u8 v8, v0 offset:6 -; GFX9-NEXT: ds_read_u8 v9, v0 offset:7 -; GFX9-NEXT: s_mov_b32 s5, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX9-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v4, v1, v2, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX9-NEXT: v_or3_b32 v4, v1, v2, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v2, v8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, v9, v3 -; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v1, v2, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX9-NEXT: ds_read_u8 v2, v0 offset:8 -; GFX9-NEXT: ds_read_u8 v6, v0 offset:9 -; GFX9-NEXT: ds_read_u8 v7, v0 offset:10 -; GFX9-NEXT: ds_read_u8 v8, v0 offset:11 -; GFX9-NEXT: ds_read_u8 v9, v0 offset:12 -; GFX9-NEXT: ds_read_u8 v10, v0 offset:13 -; GFX9-NEXT: ds_read_u8 v11, v0 offset:14 +; GFX9-NEXT: ds_read_u8 v3, v0 offset:9 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX9-NEXT: ds_read_u8 v9, v0 offset:14 ; GFX9-NEXT: ds_read_u8 v0, v0 offset:15 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v2, v2, v3, 
v6 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v6, v7, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v7, v8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v2, v3, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v3, v8, 8, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v6, v11, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_or_b32 v5, v9, v3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX9-NEXT: v_or3_b32 v3, v5, v6, v0 +; GFX9-NEXT: v_or3_b32 v3, v3, v5, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: 
v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v2, v8, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, v9, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:12 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:13 -; GFX7-NEXT: ds_read_u8 v10, v0 offset:14 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v5, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, v10, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -181,63 +146,45 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 -; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 -; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v8, v0 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:4 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v12, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v13, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:12 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:13 -; GFX10-NEXT: ds_read_u8 v16, v0 offset:14 +; GFX10-NEXT: 
ds_read_u8 v1, v0 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:14 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 -; GFX10-NEXT: v_mov_b32_e32 v17, 8 -; GFX10-NEXT: s_mov_b32 s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v11, 0xff -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_waitcnt lgkmcnt(15) -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(14) -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(13) -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_waitcnt lgkmcnt(11) -; GFX10-NEXT: v_and_b32_e32 v5, v5, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_and_b32_e32 v6, v6, v11 +; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(7) -; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX10-NEXT: s_waitcnt 
lgkmcnt(6) +; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_and_b32_e32 v8, v12, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 -; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshl_or_b32 v10, v14, 8, v13 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v12, v16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 ; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 @@ -252,7 +199,6 @@ ; GFX9-LABEL: load_lds_v4i32_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: ds_read_u16 v1, v0 ; GFX9-NEXT: ds_read_u16 v2, v0 offset:2 ; GFX9-NEXT: ds_read_u16 v3, v0 offset:4 @@ -262,27 +208,18 @@ ; GFX9-NEXT: ds_read_u16 v7, v0 offset:12 ; GFX9-NEXT: ds_read_u16 v8, v0 offset:14 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_and_b32_e32 v0, s4, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, 
v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v2, s4, v6 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, s4, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2 -; GFX9-NEXT: v_and_or_b32 v3, v7, s4, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v1, v0 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:2 ; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 @@ -291,63 +228,40 @@ ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 ; GFX7-NEXT: ds_read_u16 v7, v0 offset:12 ; GFX7-NEXT: ds_read_u16 v8, v0 offset:14 -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: s_waitcnt lgkmcnt(7) -; GFX7-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 
v3, v7, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: load_lds_v4i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 -; GFX10-NEXT: ds_read_u16 v2, v0 offset:6 -; GFX10-NEXT: ds_read_u16 v3, v0 offset:10 -; GFX10-NEXT: ds_read_u16 v4, v0 offset:14 -; GFX10-NEXT: ds_read_u16 v5, v0 -; GFX10-NEXT: ds_read_u16 v6, v0 offset:4 -; GFX10-NEXT: ds_read_u16 v7, v0 offset:8 -; GFX10-NEXT: ds_read_u16 v8, v0 offset:12 -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_waitcnt lgkmcnt(7) -; GFX10-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX10-NEXT: ds_read_u16 v1, v0 +; GFX10-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v4, v0 offset:6 +; GFX10-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX10-NEXT: ds_read_u16 v6, v0 offset:10 +; GFX10-NEXT: ds_read_u16 v7, v0 offset:12 +; GFX10-NEXT: ds_read_u16 v8, v0 offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(3) -; GFX10-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_and_or_b32 v1, v6, s4, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_and_or_b32 v2, v7, s4, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v3, v8, s4, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 ret <4 x i32> %load diff 
--git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -38,112 +38,83 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_u8 v1, v0 -; GFX9-NEXT: ds_read_u8 v3, v0 offset:1 -; GFX9-NEXT: ds_read_u8 v4, v0 offset:2 -; GFX9-NEXT: ds_read_u8 v5, v0 offset:3 -; GFX9-NEXT: ds_read_u8 v6, v0 offset:4 -; GFX9-NEXT: ds_read_u8 v7, v0 offset:5 -; GFX9-NEXT: ds_read_u8 v8, v0 offset:6 -; GFX9-NEXT: ds_read_u8 v9, v0 offset:7 -; GFX9-NEXT: s_mov_b32 s5, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX9-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX9-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(5) -; GFX9-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v3, v1, v3, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX9-NEXT: v_or3_b32 v3, v1, v2, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v4, v8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: v_and_b32_e32 v5, v9, v2 -; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v1, v4, v5 -; GFX9-NEXT: ds_read_u8 v4, v0 offset:8 -; GFX9-NEXT: ds_read_u8 v5, v0 offset:9 -; GFX9-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v2, v4 +; GFX9-NEXT: ds_read_u8 v2, v0 offset:8 +; GFX9-NEXT: ds_read_u8 v4, v0 offset:9 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX9-NEXT: ds_read_u8 v0, v0 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v7, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v4, v4, v2, v5 +; GFX9-NEXT: v_lshl_or_b32 v2, v4, 8, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v5, v6, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX9-NEXT: v_or3_b32 v2, v4, v5, v0 +; GFX9-NEXT: v_or3_b32 v2, v2, v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: ds_read_u8 v1, v0 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; 
GFX7-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v4, v7, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v4, v8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, v9, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:8 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_and_b32_e32 v4, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v5, v5, v2 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -151,52 +122,36 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 -; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 -; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v8, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v10, v0 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:4 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:8 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xff -; GFX10-NEXT: v_mov_b32_e32 v13, 8 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_mov_b32 s5, 8 -; GFX10-NEXT: s_waitcnt lgkmcnt(11) -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: ds_read_u8 v1, v0 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 +; 
GFX10-NEXT: ds_read_u8 v11, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:11 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_waitcnt lgkmcnt(7) -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_and_b32_e32 v6, v6, v12 +; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v8, v8, v12 -; GFX10-NEXT: s_waitcnt lgkmcnt(3) -; GFX10-NEXT: v_and_b32_e32 v9, v9, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v0 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 ; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 ; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 @@ -215,76 +170,50 @@ ; GFX9-NEXT: ds_read_u16 v4, v0 offset:6 ; GFX9-NEXT: ds_read_u16 v5, v0 offset:8 ; GFX9-NEXT: ds_read_u16 v6, v0 offset:10 -; 
GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_and_b32_e32 v0, s4, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, s4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 -; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_u16 v1, v0 ; GFX7-NEXT: ds_read_u16 v2, v0 offset:2 ; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 ; GFX7-NEXT: ds_read_u16 v4, v0 offset:6 ; GFX7-NEXT: ds_read_u16 v5, v0 offset:8 ; GFX7-NEXT: ds_read_u16 v6, v0 offset:10 -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
load_lds_v3i32_align2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_read_u16 v1, v0 offset:2 -; GFX10-NEXT: ds_read_u16 v2, v0 offset:6 -; GFX10-NEXT: ds_read_u16 v3, v0 offset:10 -; GFX10-NEXT: ds_read_u16 v4, v0 -; GFX10-NEXT: ds_read_u16 v5, v0 offset:4 -; GFX10-NEXT: ds_read_u16 v6, v0 offset:8 -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX10-NEXT: ds_read_u16 v1, v0 +; GFX10-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX10-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX10-NEXT: ds_read_u16 v4, v0 offset:6 +; GFX10-NEXT: ds_read_u16 v5, v0 offset:8 +; GFX10-NEXT: ds_read_u16 v6, v0 offset:10 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(3) -; GFX10-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_and_or_b32 v0, v4, s4, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v6, s4, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -18,77 +18,58 @@ ; GFX7-LABEL: load_lds_v4i32_align1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: ds_read_u8 v1, v0 ; GFX7-NEXT: 
ds_read_u8 v2, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v2, v8, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, v9, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:11 -; GFX7-NEXT: ds_read_u8 v8, v0 
offset:12 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:13 -; GFX7-NEXT: ds_read_u8 v10, v0 offset:14 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v5, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v5, v6, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, v10, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -96,63 +77,45 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 -; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 -; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v8, v0 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:4 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v12, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v13, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:12 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:13 -; GFX10-NEXT: ds_read_u8 v16, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v1, v0 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:14 ; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 -; GFX10-NEXT: v_mov_b32_e32 v17, 8 -; GFX10-NEXT: s_mov_b32 s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v11, 0xff -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_waitcnt lgkmcnt(15) -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(14) -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(13) -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_waitcnt lgkmcnt(11) -; GFX10-NEXT: v_and_b32_e32 v5, v5, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_and_b32_e32 v6, v6, v11 +; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(7) -; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX10-NEXT: s_waitcnt lgkmcnt(6) +; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_and_b32_e32 v8, v12, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 -; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshl_or_b32 v10, v14, 8, v13 ; GFX10-NEXT: s_waitcnt lgkmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v12, v16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 ; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 @@ -174,61 +137,45 @@ ; GFX7-LABEL: load_lds_v3i32_align1: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: ds_read_u8 v1, v0 -; GFX7-NEXT: ds_read_u8 v3, v0 offset:1 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 -; GFX7-NEXT: ds_read_u8 v7, v0 offset:5 -; GFX7-NEXT: ds_read_u8 v8, v0 offset:6 -; GFX7-NEXT: ds_read_u8 v9, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:7 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(5) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v4, v7, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v4, v8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, v9, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: ds_read_u8 v4, v0 offset:8 -; GFX7-NEXT: ds_read_u8 v5, v0 offset:9 -; GFX7-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:8 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:9 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 ; GFX7-NEXT: ds_read_u8 v0, v0 offset:11 -; GFX7-NEXT: s_waitcnt lgkmcnt(3) -; GFX7-NEXT: v_and_b32_e32 v4, v4, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, v6, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -236,52 +183,36 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_read_u8 v1, v0 offset:1 -; GFX10-NEXT: ds_read_u8 v2, v0 offset:2 -; GFX10-NEXT: ds_read_u8 v3, v0 offset:3 -; GFX10-NEXT: ds_read_u8 v4, v0 offset:5 -; GFX10-NEXT: ds_read_u8 v5, v0 offset:6 -; GFX10-NEXT: ds_read_u8 v6, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v7, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v8, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v10, v0 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:4 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:8 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xff -; 
GFX10-NEXT: v_mov_b32_e32 v13, 8 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_mov_b32 s5, 8 -; GFX10-NEXT: s_waitcnt lgkmcnt(11) -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: ds_read_u8 v1, v0 +; GFX10-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX10-NEXT: ds_read_u8 v3, v0 offset:2 +; GFX10-NEXT: ds_read_u8 v4, v0 offset:3 +; GFX10-NEXT: ds_read_u8 v5, v0 offset:4 +; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 +; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 +; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:11 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(9) -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_waitcnt lgkmcnt(7) -; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_and_b32_e32 v6, v6, v12 +; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_and_b32_e32 v8, v8, v12 -; GFX10-NEXT: s_waitcnt lgkmcnt(3) -; GFX10-NEXT: v_and_b32_e32 v9, v9, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v8 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshl_or_b32 v7, v10, 8, v9 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(1) -; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v0 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 ; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 ; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -12,8 +12,8 @@ ; CHECK-LABEL: name: remove_and_255_zextload ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: %ptr:_(p1) = COPY $vgpr0_vgpr1 - ; CHECK: %load:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1) - ; CHECK: $vgpr0 = COPY %load(s32) + ; CHECK: %and:_(s32) = G_ZEXTLOAD %ptr(p1) :: (load (s8), addrspace 1) + ; CHECK: $vgpr0 = COPY %and(s32) %ptr:_(p1) = COPY $vgpr0_vgpr1 %load:_(s32) = G_ZEXTLOAD %ptr :: (load (s8), addrspace 1, align 1) %mask:_(s32) = G_CONSTANT i32 255 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-load-and-mask.mir @@ -0,0 +1,24 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s + +# Post-legalizer should not generate illegal extending loads +--- +name: zextload_from_load_and_mask +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: 
$vgpr0_vgpr1 + ; CHECK-LABEL: name: zextload_from_load_and_mask + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 + ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) + ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], [[C]] + ; CHECK: $vgpr0_vgpr1 = COPY [[AND]](s64) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s64) = G_CONSTANT i64 255 + %2:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1) + %3:_(s64) = G_AND %2, %1 + $vgpr0_vgpr1 = COPY %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -501,7 +501,7 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] @@ -1393,7 +1393,6 @@ ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -387,7 +387,7 @@ ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; 
GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_endpgm @@ -945,7 +945,6 @@ ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -851,21 +851,16 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 @@ -1178,21 +1173,16 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 @@ -1304,21 +1294,16 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2 
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 @@ -1438,21 +1423,16 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[4:5] offset:1 ; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[4:5] offset:2 ; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[4:5] offset:3 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, s0, v2 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX9-GISEL-NEXT: v_or3_b32 v1, v1, v2, v3 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -350,39 +350,30 @@ ; ALIGNED-GISEL-LABEL: ds12align1: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-GISEL-NEXT: s_mov_b32 s3, 8 -; ALIGNED-GISEL-NEXT: s_movk_i32 s2, 0xff -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, 0xff ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v2 -; ALIGNED-GISEL-NEXT: 
ds_read_u8 v3, v2 offset:1 -; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v2 offset:2 -; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v2 offset:3 -; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v2 offset:4 -; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v2 offset:5 -; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v2 offset:6 -; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v2 offset:7 +; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v2 offset:1 +; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v2 offset:2 +; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v2 offset:3 +; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v2 offset:4 +; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v2 offset:5 +; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v2 offset:6 +; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v2 offset:7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_sdwa v3, s3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; ALIGNED-GISEL-NEXT: v_and_or_b32 v0, v0, s2, v3 +; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-GISEL-NEXT: v_and_b32_e32 v3, s2, v4 +; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_and_b32_e32 v4, s2, v5 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v1, v3 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v6, 8, v5 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) -; ALIGNED-GISEL-NEXT: v_and_b32_e32 v4, v8, v1 +; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; ALIGNED-GISEL-NEXT: v_and_b32_e32 v1, v9, v1 -; ALIGNED-GISEL-NEXT: v_and_or_b32 v3, v6, s2, v3 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; 
ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v3, v4, v1 +; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v1, v3, v4 ; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v2 offset:8 ; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v2 offset:9 ; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v2 offset:10 @@ -453,34 +444,29 @@ ; ALIGNED-GISEL-LABEL: ds12align2: ; ALIGNED-GISEL: ; %bb.0: ; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 0xffff ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 -; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 -; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 -; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 -; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 -; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10 +; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:4 +; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:6 +; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:8 +; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:10 +; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) -; ALIGNED-GISEL-NEXT: v_and_b32_e32 v0, s2, v2 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v3, 16, v1 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) -; ALIGNED-GISEL-NEXT: v_and_b32_e32 v2, s2, v4 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; ALIGNED-GISEL-NEXT: v_and_or_b32 v0, v1, s2, v0 -; ALIGNED-GISEL-NEXT: v_and_or_b32 v1, v3, s2, v2 -; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v0 -; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v2 offset:2 +; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v5, 16, v4 +; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 +; 
ALIGNED-GISEL-NEXT: ds_write_b16 v2, v3 offset:2 ; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v1 offset:4 -; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v0 offset:6 +; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v1 offset:4 +; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 offset:6 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v5 offset:8 +; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v6 offset:8 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-GISEL-NEXT: ds_write_b16 v3, v6 offset:10 +; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v7 offset:10 ; ALIGNED-GISEL-NEXT: s_endpgm ; ; UNALIGNED-LABEL: ds12align2: