Index: llvm/lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.td
+++ llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1062,6 +1062,11 @@
        "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
   AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>;
 
+def isGFX7GFX8 :
+  Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
+            "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+  AssemblerPredicate<(all_of FeatureSouthernIslands, FeatureCIInsts)>;
+
 def isGFX7GFX8GFX9 :
   Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
             "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -92,6 +92,10 @@
     GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
     GIComplexPatternEquiv<DS64Bit4ByteAligned>;
 
+def gi_ds_128bit_8byte_aligned :
+    GIComplexOperandMatcher<s128, "selectDS128Bit8ByteAligned">,
+    GIComplexPatternEquiv<DS128Bit8ByteAligned>;
+
 def gi_mubuf_addr64 :
     GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
     GIComplexPatternEquiv<MUBUFAddr64>;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -205,6 +205,10 @@
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
+  bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+                                  SDValue &Offset1) const;
+  bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+                          SDValue &Offset1, bool IsDS128) const;
   bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
@@ -1234,38 +1238,52 @@
 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
+  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, false);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
+                                                    SDValue &Offset0,
+                                                    SDValue &Offset1) const {
+  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, true);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset0, SDValue &Offset1,
+                                            bool IsDS128) const {
   SDLoc DL(Addr);
+  unsigned Align = IsDS128 ? 8 : 4;
 
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    unsigned DWordOffset0 = C1->getZExtValue() / 4;
-    unsigned DWordOffset1 = DWordOffset0 + 1;
+    unsigned OffsetValue0 = C1->getZExtValue() / Align;
+    unsigned OffsetValue1 = OffsetValue0 + 1;
 
     // (add n0, c0)
-    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+    if (isDSOffsetLegal(N0, OffsetValue1, 8)) {
       Base = N0;
-      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
       return true;
     }
   } else if (Addr.getOpcode() == ISD::SUB) {
     // sub C, x -> add (sub 0, x), C
-    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
-      unsigned DWordOffset0 = C->getZExtValue() / 4;
-      unsigned DWordOffset1 = DWordOffset0 + 1;
+    if (const ConstantSDNode *C =
+            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      unsigned OffsetValue0 = C->getZExtValue() / Align;
+      unsigned OffsetValue1 = OffsetValue0 + 1;
 
-      if (isUInt<8>(DWordOffset0)) {
+      if (isUInt<8>(OffsetValue0)) {
         SDLoc DL(Addr);
         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
 
         // XXX - This is kind of hacky. Create a dummy sub node so we can check
         // the known bits in isDSOffsetLegal. We need to emit the selected node
         // here, so this is thrown away.
-        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
-                                      Zero, Addr.getOperand(1));
+        SDValue Sub =
+            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
 
-        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+        if (isDSOffsetLegal(Sub, OffsetValue1, 8)) {
           SmallVector<SDValue, 3> Opnds;
           Opnds.push_back(Zero);
           Opnds.push_back(Addr.getOperand(1));
@@ -1276,29 +1294,28 @@
                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
           }
 
-          MachineSDNode *MachineSub
-            = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
+          MachineSDNode *MachineSub = CurDAG->getMachineNode(
+              SubOp, DL, (IsDS128 ? MVT::i64 : MVT::i32), Opnds);
 
           Base = SDValue(MachineSub, 0);
-          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
-          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+          Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
+          Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
           return true;
         }
       }
     }
   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
-    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
-    unsigned DWordOffset1 = DWordOffset0 + 1;
-    assert(4 * DWordOffset0 == CAddr->getZExtValue());
+    unsigned OffsetValue0 = CAddr->getZExtValue() / Align;
+    unsigned OffsetValue1 = OffsetValue0 + 1;
+    assert(Align * OffsetValue0 == CAddr->getZExtValue());
 
-    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+    if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      MachineSDNode *MovZero
-        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                 DL, MVT::i32, Zero);
+      MachineSDNode *MovZero =
+          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
       Base = SDValue(MovZero, 0);
-      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
       return true;
     }
   }
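A note for readers, not part of the patch: SelectDSReadWrite2 encodes the two ds_read2/ds_write2 offsets in element-size units, 4 bytes for the b32 forms and 8 bytes for the new b64 forms, so an 8-byte-aligned 128-bit LDS access at byte offset 24 lands in consecutive 64-bit slots 3 and 4. A minimal IR sketch of that case; the function name and the expected line are my assumptions, not checks taken from the patch's tests:

define <4 x i32> @load_lds_v4i32_align8_offset24(<4 x i32> addrspace(3)* %ptr) {
  ; step the base pointer forward by 24 bytes (3 x 8), keeping only 8-byte alignment
  %p8 = bitcast <4 x i32> addrspace(3)* %ptr to i8 addrspace(3)*
  %gep = getelementptr inbounds i8, i8 addrspace(3)* %p8, i32 24
  %p = bitcast i8 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 8
  ret <4 x i32> %load
}
; Expected selection on hawaii, modulo the m0 setup (assumption):
;   ds_read2_b64 v[0:3], v0 offset0:3 offset1:4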
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -196,11 +196,17 @@
   InstructionSelector::ComplexRendererFns
   selectDS1Addr1Offset(MachineOperand &Root) const;
 
-  std::pair<Register, unsigned>
-  selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
   selectDS64Bit4ByteAligned(MachineOperand &Root) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectDS128Bit8ByteAligned(MachineOperand &Root) const;
+
+  std::pair<Register, unsigned>
+  selectDSReadWrite2Impl(MachineOperand &Root, bool IsDS128) const;
+  InstructionSelector::ComplexRendererFns
+  selectDSReadWrite2(MachineOperand &Root, bool IsDS128) const;
+
   std::pair<Register, int64_t>
   getPtrBaseWithConstantOffset(Register Root,
                                const MachineRegisterInfo &MRI) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3343,9 +3343,20 @@
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
+  return selectDSReadWrite2(Root, false);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
+  return selectDSReadWrite2(Root, true);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
+                                              bool IsDS128) const {
   Register Reg;
   unsigned Offset;
-  std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
+  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, IsDS128);
   return {{
       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
@@ -3354,7 +3365,8 @@
 }
 
 std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
+                                                  bool IsDS128) const {
   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
   if (!RootDef)
     return std::make_pair(Root.getReg(), 0);
@@ -3367,11 +3379,11 @@
     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
 
   if (Offset) {
-    int64_t DWordOffset0 = Offset / 4;
-    int64_t DWordOffset1 = DWordOffset0 + 1;
-    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
+    int64_t OffsetValue0 = Offset / (IsDS128 ? 8 : 4);
+    int64_t OffsetValue1 = OffsetValue0 + 1;
+    if (isDSOffsetLegal(PtrBase, OffsetValue1, (IsDS128 ? 16 : 8))) {
       // (add n0, c0)
-      return std::make_pair(PtrBase, DWordOffset0);
+      return std::make_pair(PtrBase, OffsetValue0);
     }
   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
     // TODO
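For illustration only: on the GlobalISel side the same selectDSReadWrite2 routine feeds the write pattern defined further down, which slices a 128-bit value into two 64-bit halves via EXTRACT_SUBREG and emits ds_write2_b64 on targets that cannot prove 16-byte alignment. The patch's own store-local.128.ll align-8 kernel test shows the shape; a function-form sketch under that assumption, with the exact register assignment illustrative:

define void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
  ret void
}
; Expected on hawaii, roughly (assumption):
;   ds_write2_b64 v0, v[1:2], v[3:4] offset1:1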
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -485,17 +485,16 @@
 defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
 defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op<AMDGPUatomic_cmp_swap>;
 
-
-def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
+                        Aligned<8> {
   let IsLoad = 1;
   let IsNonExtLoad = 1;
-  let MinAlignment = 8;
 }
 
-def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+def load_align16_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
+                         Aligned<16> {
   let IsLoad = 1;
   let IsNonExtLoad = 1;
-  let MinAlignment = 16;
 }
 
 def store_align8_local: PatFrag<(ops node:$val, node:$ptr),
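Worth noting, though the patch does not say so explicitly: the DSInstructions.td changes below loop over all of VReg_96.RegTypes and VReg_128.RegTypes rather than hard-coding v4i32, so every 96- and 128-bit value type gains the b96/b128 forms. A hypothetical case outside the v3i32/v4i32 tests included here:

define <2 x double> @load_lds_v2f64(<2 x double> addrspace(3)* %ptr) {
  %load = load <2 x double>, <2 x double> addrspace(3)* %ptr, align 16
  ret <2 x double> %load
}
; Expected on gfx900 (assumption): ds_read_b128 v[0:3], v0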
Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -680,7 +680,29 @@
 defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
 }
 
-defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
+let SubtargetPredicate = isGFX7GFX8 in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
+}
+
+}
+
+let SubtargetPredicate = isGFX9Plus in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
+}
+
+}
 
 } // End AddedComplexity = 100
@@ -761,6 +783,18 @@
   (i1 0))
 >;
 
+class DS128Bit8ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+  (vt:$value (frag (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+  (inst $ptr, $offset0, $offset1, (i1 0))
+>;
+
+class DS128Bit8ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
+  (frag vt:$value, (DS128Bit8ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+  (inst $ptr, (i64 (EXTRACT_SUBREG VReg_128:$value, sub0_sub1)),
+        (i64 (EXTRACT_SUBREG VReg_128:$value, sub2_sub3)), $offset0, $offset1,
+        (i1 0))
+>;
+
 multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
   let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
     def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
     def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>;
   }
@@ -773,19 +807,57 @@
   }
 }
 
+multiclass DS128Bit8ByteAlignedPat_mc<ValueType vt> {
+  let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
+    def : DS128Bit8ByteAlignedReadPat<DS_READ2_B64, vt, load_local_m0>;
+    def : DS128Bit8ByteAlignedWritePat<DS_WRITE2_B64, vt, store_local_m0>;
+  }
+
+  let OtherPredicates = [NotLDSRequiresM0Init] in {
+    def : DS128Bit8ByteAlignedReadPat<DS_READ2_B64_gfx9, vt, load_local>;
+    def : DS128Bit8ByteAlignedWritePat<DS_WRITE2_B64_gfx9, vt, store_local>;
+  }
+}
+
 // v2i32 loads are split into i32 loads on SI during lowering, due to a bug
 // related to bounds checking.
 foreach vt = VReg_64.RegTypes in {
 defm : DS64Bit4ByteAlignedPat_mc<vt>;
 }
 
+foreach vt = VReg_128.RegTypes in {
+defm : DS128Bit8ByteAlignedPat_mc<vt>;
+}
+
 let AddedComplexity = 100 in {
 
 foreach vt = VReg_64.RegTypes in {
 defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
 }
 
-defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
+let SubtargetPredicate = isGFX7GFX8 in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
+}
+
+}
+
+let SubtargetPredicate = isGFX9Plus in {
+
+foreach vt = VReg_96.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
+}
+
+}
 } // End AddedComplexity = 100
 
 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -450,16 +450,15 @@
 }
 
 def load_align8_local_m0 : PatFrag<(ops node:$ptr),
-                                   (load_local_m0 node:$ptr)> {
+                                   (load_local_m0 node:$ptr)>, Aligned<8> {
   let IsLoad = 1;
   let IsNonExtLoad = 1;
-  let MinAlignment = 8;
 }
+
 def load_align16_local_m0 : PatFrag<(ops node:$ptr),
-                                    (load_local_m0 node:$ptr)> {
+                                    (load_local_m0 node:$ptr)>, Aligned<16> {
   let IsLoad = 1;
   let IsNonExtLoad = 1;
-  let MinAlignment = 16;
 }
 
 } // End IsLoad = 1
@@ -535,20 +534,18 @@
   }
 }
 
-def store_align16_local_m0 : PatFrag <
-  (ops node:$value, node:$ptr),
-  (store_local_m0 node:$value, node:$ptr)> {
+def store_align8_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+                                     (store_local_m0 node:$value, node:$ptr)>,
+                            Aligned<8> {
   let IsStore = 1;
   let IsTruncStore = 0;
-  let MinAlignment = 16;
 }
 
-def store_align8_local_m0 : PatFrag <
-  (ops node:$value, node:$ptr),
-  (store_local_m0 node:$value, node:$ptr)> {
+def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+                                      (store_local_m0 node:$value, node:$ptr)>,
+                             Aligned<16> {
   let IsStore = 1;
   let IsTruncStore = 0;
-  let MinAlignment = 8;
 }
 
 let AddressSpaces = StoreAddress_local.AddrSpaces in {
@@ -1308,6 +1305,7 @@
 
 def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
 def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
+def DS128Bit8ByteAligned : ComplexPattern<i32, 3, "SelectDS128Bit8ByteAligned">;
 
 def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+; FIXME:
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+
+define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: load_lds_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: ds_read_b128 v[0:3], v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: load_lds_v4i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b128 v[0:3], v0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
+  ret <4 x i32> %load
+}
+
+define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: load_lds_v4i32_align1:
+; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v1, v0 +; GFX9-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX9-NEXT: ds_read_u8 v4, v0 offset:2 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:3 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:4 +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NEXT: v_or3_b32 v4, v1, v2, v4 +; GFX9-NEXT: ds_read_u8 v1, v0 offset:5 +; GFX9-NEXT: ds_read_u8 v2, v0 offset:6 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:7 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:8 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v2, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:11 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:12 +; GFX9-NEXT: ds_read_u8 v9, v0 offset:13 +; GFX9-NEXT: ds_read_u8 v10, v0 offset:14 +; GFX9-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX9-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v6, v6, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v7, v7, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7 +; GFX9-NEXT: v_and_b32_e32 v6, v10, v3 +; GFX9-NEXT: v_and_or_b32 v5, v8, v3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: v_or3_b32 v3, v5, v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v4i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u8 v1, v0 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: ds_read_u8 v2, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:8 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:9 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v2, v6, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v5, v6, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 + ret <4 x i32> %load +} + +define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v4i32_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v1, v0 +; GFX9-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX9-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX9-NEXT: ds_read_u16 v5, v0 offset:6 +; GFX9-NEXT: ds_read_u16 v6, v0 offset:8 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, s4, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 +; GFX9-NEXT: ds_read_u16 v2, v0 offset:10 +; GFX9-NEXT: ds_read_u16 v3, v0 offset:12 +; GFX9-NEXT: ds_read_u16 v0, v0 offset:14 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v3, v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v2, v6, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
load_lds_v4i32_align2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v1, v0 +; GFX7-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX7-NEXT: ds_read_u16 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u16 v6, v0 offset:8 +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:10 +; GFX7-NEXT: ds_read_u16 v5, v0 offset:12 +; GFX7-NEXT: ds_read_u16 v0, v0 offset:14 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 + ret <4 x i32> %load +} + +define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v4i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v4i32_align4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 + ret <4 x i32> %load +} + +define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v4i32_align8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v4i32_align8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 + ret <4 x i32> %load +} + +define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v4i32_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v4i32_align16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b128 v[0:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 + ret <4 x i32> %load +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll 
=================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -0,0 +1,262 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +; FIXME: +; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s + +define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[0:2], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ds_read_u8 v0, v0 +; GFX9-NEXT: ds_read_u8 v1, v2 offset:1 +; GFX9-NEXT: ds_read_u8 v4, v2 offset:2 +; GFX9-NEXT: ds_read_u8 v5, v2 offset:3 +; GFX9-NEXT: ds_read_u8 v6, v2 offset:4 +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v4 +; GFX9-NEXT: ds_read_u8 v1, v2 offset:5 +; GFX9-NEXT: ds_read_u8 v4, v2 offset:6 +; GFX9-NEXT: ds_read_u8 v5, v2 offset:7 +; GFX9-NEXT: ds_read_u8 v7, v2 offset:8 +; GFX9-NEXT: ds_read_u8 v8, v2 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v4, v5 +; GFX9-NEXT: ds_read_u8 v4, v2 offset:10 +; GFX9-NEXT: ds_read_u8 v2, v2 offset:11 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v5, v7, v3, v5 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v5, v4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: ds_read_u8 v0, v0 +; GFX7-NEXT: ds_read_u8 v1, v2 offset:1 +; GFX7-NEXT: ds_read_u8 v4, v2 offset:2 +; GFX7-NEXT: ds_read_u8 v5, v2 offset:3 +; GFX7-NEXT: ds_read_u8 v6, v2 offset:4 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: ds_read_u8 v4, v2 offset:5 +; GFX7-NEXT: ds_read_u8 v5, v2 offset:6 +; GFX7-NEXT: ds_read_u8 v6, v2 offset:7 +; GFX7-NEXT: ds_read_u8 v7, v2 offset:8 +; GFX7-NEXT: ds_read_u8 v8, v2 offset:9 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v4, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: ds_read_u8 v4, v2 offset:10 +; GFX7-NEXT: ds_read_u8 v2, v2 offset:11 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: ds_read_u16 v1, v2 offset:2 +; GFX9-NEXT: ds_read_u16 v3, v2 offset:4 +; GFX9-NEXT: ds_read_u16 v4, v2 offset:6 +; GFX9-NEXT: ds_read_u16 v5, v2 offset:8 +; GFX9-NEXT: ds_read_u16 v2, v2 offset:10 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: ds_read_u16 v1, v2 offset:2 +; 
GFX7-NEXT: ds_read_u16 v3, v2 offset:4 +; GFX7-NEXT: ds_read_u16 v4, v2 offset:6 +; GFX7-NEXT: ds_read_u16 v5, v2 offset:8 +; GFX7-NEXT: ds_read_u16 v2, v2 offset:10 +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[0:1], v0 +; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[0:2], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 + ret <3 x i32> %load +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -0,0 +1,253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | 
FileCheck -check-prefixes=GCN,GFX7 %s + +; Unaligned DS access in available from GFX9 onwards. +; LDS alignment enforcement is controlled by a configuration register: +; SH_MEM_CONFIG.alignment_mode + +define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v4i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v4i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u8 v1, v0 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: ds_read_u8 v2, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:8 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:9 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v2, v6, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v5, v6, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 + ret <4 x 
i32> %load +} + +define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: ds_read_u8 v0, v0 +; GFX7-NEXT: ds_read_u8 v1, v2 offset:1 +; GFX7-NEXT: ds_read_u8 v4, v2 offset:2 +; GFX7-NEXT: ds_read_u8 v5, v2 offset:3 +; GFX7-NEXT: ds_read_u8 v6, v2 offset:4 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: ds_read_u8 v4, v2 offset:5 +; GFX7-NEXT: ds_read_u8 v5, v2 offset:6 +; GFX7-NEXT: ds_read_u8 v6, v2 offset:7 +; GFX7-NEXT: ds_read_u8 v7, v2 offset:8 +; GFX7-NEXT: ds_read_u8 v8, v2 offset:9 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v4, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: ds_read_u8 v4, v2 offset:10 +; GFX7-NEXT: ds_read_u8 v2, v2 offset:11 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 + ret <3 x i32> %load +} + +define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b128 v0, v[1:4] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: store_lds_v4i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:1 +; GFX7-NEXT: ds_write_b8 v0, v6 offset:2 +; GFX7-NEXT: ds_write_b8 v0, v7 offset:3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v6 offset:7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX7-NEXT: ds_write_b8 v0, v3 offset:8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:10 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:11 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; GFX7-NEXT: ds_write_b8 v0, v4 offset:12 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:14 +; GFX7-NEXT: ds_write_b8 v0, v3 offset:15 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 + ret void +} + +define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b96 v0, v[1:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: store_lds_v3i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: ds_write_b8 v0, v4 offset:1 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:2 +; GFX7-NEXT: ds_write_b8 v0, v6 offset:3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: ds_write_b8 v0, v4 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:7 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; GFX7-NEXT: ds_write_b8 v0, v3 offset:8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:10 +; GFX7-NEXT: ds_write_b8 v0, v4 offset:11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -0,0 +1,397 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +; FIXME: +; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s + +define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: 
s_endpgm +; +; GFX7-LABEL: store_lds_v4i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_write_b128 v4, v[0:3] +; GFX7-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_add_u32 s0, s4, 1 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 2 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 3 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 4 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 5 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 6 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 7 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_add_u32 s0, s4, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 9 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 10 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 11 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_add_u32 s0, s4, 12 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s3, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 13 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 14 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s5, s3, 24 +; GFX9-NEXT: s_add_u32 s0, s4, 15 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v4i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_lshr_b32 s5, s0, 8
+; GFX7-NEXT: s_lshr_b32 s6, s0, 16
+; GFX7-NEXT: s_lshr_b32 s7, s0, 24
+; GFX7-NEXT: s_add_u32 s0, s4, 1
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 2
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 3
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s7
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 4
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_lshr_b32 s5, s1, 8
+; GFX7-NEXT: s_add_u32 s0, s4, 5
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s6, s1, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 6
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: s_lshr_b32 s7, s1, 24
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 7
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s7
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: s_lshr_b32 s1, s2, 8
+; GFX7-NEXT: s_add_u32 s0, s4, 9
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s5, s2, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 10
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: s_lshr_b32 s6, s2, 24
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 11
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: s_add_u32 s0, s4, 12
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: s_lshr_b32 s1, s3, 8
+; GFX7-NEXT: s_add_u32 s0, s4, 13
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_lshr_b32 s2, s3, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 14
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: s_lshr_b32 s5, s3, 24
+; GFX7-NEXT: s_add_u32 s0, s4, 15
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+; GFX9-LABEL: store_lds_v4i32_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s5, s0, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 2
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 4
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshr_b32 s5, s1, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 6
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: s_add_u32 s0, s4, 8
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 10
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 12
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: s_lshr_b32 s1, s3, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 14
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v4i32_align2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_lshr_b32 s5, s0, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 2
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 4
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_lshr_b32 s5, s1, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 6
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: s_lshr_b32 s1, s2, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 10
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 12
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: s_lshr_b32 s1, s3, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 14
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+; GFX9-LABEL: store_lds_v4i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v4i32_align4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 4
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: s_add_u32 s0, s4, 12
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+; GFX9-LABEL: store_lds_v4i32_align8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v4i32_align8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX7-NEXT: s_endpgm
+ store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
+; GFX9-LABEL: store_lds_v4i32_align16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v4i32_align16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: ds_write_b128 v4, v[0:3]
+; GFX7-NEXT: s_endpgm
+ store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16
+ ret void
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+
+; FIXME:
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+
+define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+; GFX9-LABEL: store_lds_v3i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: ds_write_b96 v3, v[0:2]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: ds_write_b96 v3, v[0:2]
+; GFX7-NEXT: s_endpgm
+ store <3 x i32> %x, <3 x i32> addrspace(3)* %out
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+; GFX9-LABEL: store_lds_v3i32_align1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s3, s0, 8
+; GFX9-NEXT: s_lshr_b32 s5, s0, 16
+; GFX9-NEXT: s_lshr_b32 s6, s0, 24
+; GFX9-NEXT: s_add_u32 s0, s4, 1
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 2
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 3
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 4
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshr_b32 s3, s1, 8
+; GFX9-NEXT: s_add_u32 s0, s4, 5
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_lshr_b32 s5, s1, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 6
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: s_lshr_b32 s6, s1, 24
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 7
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_add_u32 s0, s4, 8
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_lshr_b32 s1, s2, 8
+; GFX9-NEXT: s_add_u32 s0, s4, 9
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshr_b32 s3, s2, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 10
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: s_lshr_b32 s5, s2, 24
+; GFX9-NEXT: s_add_u32 s0, s4, 11
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b8 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v3i32_align1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_lshr_b32 s3, s0, 8
+; GFX7-NEXT: s_lshr_b32 s5, s0, 16
+; GFX7-NEXT: s_lshr_b32 s6, s0, 24
+; GFX7-NEXT: s_add_u32 s0, s4, 1
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 2
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 3
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 4
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_lshr_b32 s3, s1, 8
+; GFX7-NEXT: s_add_u32 s0, s4, 5
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_lshr_b32 s5, s1, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 6
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: s_lshr_b32 s6, s1, 24
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 7
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: s_lshr_b32 s1, s2, 8
+; GFX7-NEXT: s_add_u32 s0, s4, 9
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_lshr_b32 s3, s2, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 10
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: s_lshr_b32 s5, s2, 24
+; GFX7-NEXT: s_add_u32 s0, s4, 11
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b8 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+; GFX9-LABEL: store_lds_v3i32_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 2
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s4, 4
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_lshr_b32 s3, s1, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 6
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: s_add_u32 s0, s4, 8
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_lshr_b32 s1, s2, 16
+; GFX9-NEXT: s_add_u32 s0, s4, 10
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_write_b16 v1, v0
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v3i32_align2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_lshr_b32 s3, s0, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 2
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_add_u32 s0, s4, 4
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: s_lshr_b32 s3, s1, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 6
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s3
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: s_lshr_b32 s1, s2, 16
+; GFX7-NEXT: s_add_u32 s0, s4, 10
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b16 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+; GFX9-LABEL: store_lds_v3i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: ds_write_b96 v3, v[0:2]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v3i32_align4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+; GFX9-LABEL: store_lds_v3i32_align8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: ds_write_b96 v3, v[0:2]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v3i32_align8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_add_u32 s0, s4, 8
+; GFX7-NEXT: ds_write_b64 v2, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+ store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
+; GFX9-LABEL: store_lds_v3i32_align16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: ds_write_b96 v3, v[0:2]
+; GFX9-NEXT: s_endpgm
+;
+; GFX7-LABEL: store_lds_v3i32_align16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: ds_write_b96 v3, v[0:2]
+; GFX7-NEXT: s_endpgm
+ store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
+ ret void
+}
Index: llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
===================================================================
--- llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -526,7 +526,8 @@
 }
 
 ; CHECK-LABEL: @merge_local_store_4_constants_i32
-; CHECK: store <4 x i32> , <4 x i32> addrspace(3)*
+; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %1, align 4
+; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %2, align 4
 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
Index: llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
===================================================================
--- llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -1,4 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -basic-aa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -basic-aa -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -basic-aa -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
@@ -6,10 +7,10 @@
 ; for the same head starting a chain.
 @0 = internal addrspace(3) global [16384 x i32] undef
 
-; CHECK-LABEL: @no_crash(
-; CHECK: store <2 x i32> zeroinitializer
-; CHECK: store i32 0
-; CHECK: store i32 0
+; GCN-LABEL: @no_crash(
+; GCN: store <2 x i32> zeroinitializer
+; GCN: store i32 0
+; GCN: store i32 0
 
 define amdgpu_kernel void @no_crash(i32 %arg) {
 %tmp2 = add i32 %arg, 14
@@ -28,13 +29,22 @@
 ; Check adjacent memory locations are properly matched and the
 ; longest chain vectorized
 
-; CHECK-LABEL: @interleave_get_longest
-; CHECK: load <4 x i32>
-; CHECK: load i32
-; CHECK: store <2 x i32> zeroinitializer
-; CHECK: load i32
-; CHECK: load i32
-; CHECK: load i32
+; GCN-LABEL: @interleave_get_longest
+
+; GFX7: load <2 x i32>
+; GFX7: load i32
+; GFX7: store <2 x i32> zeroinitializer
+; GFX7: load i32
+; GFX7: load <2 x i32>
+; GFX7: load i32
+; GFX7: load i32
+
+; GFX9: load <4 x i32>
+; GFX9: load i32
+; GFX9: store <2 x i32> zeroinitializer
+; GFX9: load i32
+; GFX9: load i32
+; GFX9: load i32
 
 define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
 %a1 = add i32 %arg, 1