diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -388,6 +388,7 @@
   }
 
   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
 
   // FIXME: This should be narrowed to i32, but that only happens if i64 is
   // illegal.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -96,7 +96,8 @@
                               unsigned Opcode) const;
 
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
-                               MachineInstr &Inst, unsigned Opcode) const;
+                               MachineInstr &Inst, unsigned Opcode,
+                               bool Swap = false) const;
 
   void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                               MachineDominatorTree *MDT = nullptr) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5742,6 +5742,11 @@
       Inst.eraseFromParent();
       continue;
 
+    case AMDGPU::S_BREV_B64:
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+      Inst.eraseFromParent();
+      continue;
+
     case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
      Inst.eraseFromParent();
@@ -6292,7 +6297,7 @@
 
 void SIInstrInfo::splitScalar64BitUnaryOp(
     SetVectorType &Worklist, MachineInstr &Inst,
-    unsigned Opcode) const {
+    unsigned Opcode, bool Swap) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
@@ -6325,6 +6330,9 @@
   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
 
+  if (Swap)
+    std::swap(DestSub0, DestSub1);
+
   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
     .addReg(DestSub0)
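Note on the Swap path (an illustrative aside, not part of the patch): reversing all 64 bits reverses each 32-bit half and also exchanges the halves, so the split expansion can apply S_BREV_B32 to each sub-register but must cross-wire DestSub0/DestSub1 before building the REG_SEQUENCE. A minimal self-contained C++ check of that identity; brev32 and the constants are local names chosen here, not LLVM code:

  #include <cassert>
  #include <cstdint>

  // Portable 32-bit bit reversal: bit I of X becomes bit (31 - I) of the result.
  static uint32_t brev32(uint32_t X) {
    uint32_t R = 0;
    for (int I = 0; I < 32; ++I)
      R = (R << 1) | ((X >> I) & 1);
    return R;
  }

  int main() {
    uint64_t X = 0x0123456789abcdefull;
    // Identity the Swap flag relies on: the reversed low half becomes the high
    // half of the result and vice versa, hence the std::swap of the
    // destination sub-registers above.
    uint64_t ViaHalves = ((uint64_t)brev32((uint32_t)X) << 32) |
                         brev32((uint32_t)(X >> 32));
    assert(ViaHalves == 0xf7b3d591e6a2c480ull); // full 64-bit bitreverse of X
    return 0;
  }

The same reasoning shows up in the regenerated VALU checks below, where each half goes through v_bfrev_b32 and the pair is then stored in swapped order (v[1:2], v[1:4]).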
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -195,7 +195,9 @@
 def S_BREV_B32 : SOP1_32 <"s_brev_b32",
   [(set i32:$sdst, (bitreverse i32:$src0))]
 >;
-def S_BREV_B64 : SOP1_64 <"s_brev_b64">;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64",
+  [(set i64:$sdst, (bitreverse i64:$src0))]
+>;
 
 let Defs = [SCC] in {
 def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
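For context, a hedged usage sketch (my addition, not taken from the patch): Clang exposes this operation as the __builtin_bitreverse64 builtin, which lowers to the llvm.bitreverse.i64 intrinsic exercised by the tests below, so with the pattern above a uniform 64-bit reversal now selects to a single s_brev_b64 instead of the long shift-and-mask sequence being deleted:

  #include <cstdint>

  // Compiles to a call to llvm.bitreverse.i64; with BITREVERSE legal for
  // MVT::i64 and the SOP1 pattern above, isel emits one s_brev_b64 when the
  // operand is uniform (held in SGPRs).
  uint64_t reverse_bits(uint64_t X) {
    return __builtin_bitreverse64(X);
  }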
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -235,92 +235,28 @@
 define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
 ; SI-LABEL: s_brev_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s4, 0xff00ff
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
-; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
-; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
-; SI-NEXT:    v_alignbit_b32 v3, s3, s3, 24
-; SI-NEXT:    v_bfi_b32 v4, s4, v1, v0
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f
-; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
-; SI-NEXT:    v_and_b32_e32 v1, s2, v4
-; SI-NEXT:    v_and_b32_e32 v0, s2, v2
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT:    v_and_b32_e32 v3, s2, v4
-; SI-NEXT:    v_and_b32_e32 v2, s2, v2
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT:    s_mov_b32 s2, 0x33333333
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_and_b32_e32 v1, s2, v3
-; SI-NEXT:    v_and_b32_e32 v0, s2, v2
-; SI-NEXT:    s_mov_b32 s2, 0xcccccccc
-; SI-NEXT:    v_and_b32_e32 v3, s2, v3
-; SI-NEXT:    v_and_b32_e32 v2, s2, v2
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT:    s_mov_b32 s2, 0x55555555
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_and_b32_e32 v1, s2, v3
-; SI-NEXT:    v_and_b32_e32 v0, s2, v2
-; SI-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
-; SI-NEXT:    v_and_b32_e32 v3, s2, v3
-; SI-NEXT:    v_and_b32_e32 v2, s2, v2
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; FLAT-LABEL: s_brev_i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; FLAT-NEXT:    v_mov_b32_e32 v0, 0x10203
-; FLAT-NEXT:    s_mov_b32 s4, 0xf0f0f0f
-; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v2, 0, s2, v0
-; FLAT-NEXT:    v_perm_b32 v4, 0, s3, v0
-; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT:    v_and_b32_e32 v1, s4, v2
-; FLAT-NEXT:    v_and_b32_e32 v0, s4, v4
-; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
-; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s2, 0x33333333
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT:    s_mov_b32 s2, 0xcccccccc
-; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s2, 0x55555555
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
-; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s3, 0xf000
-; FLAT-NEXT:    s_mov_b32 s2, -1
-; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
+; FLAT-NEXT:    v_mov_b32_e32 v0, s0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s1
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
@@ -339,46 +275,11 @@
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s0, 0xff00ff
-; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT:    s_mov_b32 s3, 0x33333333
-; SI-NEXT:    s_mov_b32 s6, 0xcccccccc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
-; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
-; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT:    v_bfi_b32 v2, s0, v0, v2
-; SI-NEXT:    v_bfi_b32 v4, s0, v1, v3
-; SI-NEXT:    v_and_b32_e32 v1, s1, v2
-; SI-NEXT:    v_and_b32_e32 v0, s1, v4
-; SI-NEXT:    v_and_b32_e32 v3, s2, v2
-; SI-NEXT:    v_and_b32_e32 v2, s2, v4
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT:    s_mov_b32 s0, 0x55555555
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_and_b32_e32 v1, s3, v3
-; SI-NEXT:    v_and_b32_e32 v0, s3, v2
-; SI-NEXT:    v_and_b32_e32 v3, s6, v3
-; SI-NEXT:    v_and_b32_e32 v2, s6, v2
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_and_b32_e32 v1, s0, v3
-; SI-NEXT:    v_and_b32_e32 v0, s0, v2
-; SI-NEXT:    v_and_b32_e32 v3, s1, v3
-; SI-NEXT:    v_and_b32_e32 v2, s1, v2
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_bfrev_b32_e32 v2, v0
+; SI-NEXT:    v_bfrev_b32_e32 v1, v1
+; SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; FLAT-LABEL: v_brev_i64:
@@ -386,48 +287,17 @@
 ; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
-; FLAT-NEXT:    s_mov_b32 s6, 0xcccccccc
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s0, 0x10203
-; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s0
-; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s0
-; FLAT-NEXT:    v_and_b32_e32 v1, s1, v2
-; FLAT-NEXT:    v_and_b32_e32 v0, s1, v4
-; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
-; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s0, 0x55555555
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s6, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s6, v2
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s6, -1
-; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
+; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
+; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
@@ -442,76 +312,15 @@
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s8, 0xff00ff
-; SI-NEXT:    s_mov_b32 s9, 0x33333333
-; SI-NEXT:    s_mov_b32 s10, 0xcccccccc
-; SI-NEXT:    s_mov_b32 s11, 0x55555555
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
-; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
-; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
-; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
-; SI-NEXT:    v_alignbit_b32 v0, s3, s3, 24
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f
-; SI-NEXT:    v_bfi_b32 v2, s8, v0, v2
-; SI-NEXT:    s_mov_b32 s3, 0xf0f0f0f0
-; SI-NEXT:    v_and_b32_e32 v0, s2, v2
-; SI-NEXT:    v_and_b32_e32 v1, s2, v3
-; SI-NEXT:    v_and_b32_e32 v2, s3, v2
-; SI-NEXT:    v_and_b32_e32 v3, s3, v3
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT:    v_alignbit_b32 v4, s0, s0, 8
-; SI-NEXT:    v_alignbit_b32 v5, s0, s0, 24
-; SI-NEXT:    v_bfi_b32 v7, s8, v5, v4
-; SI-NEXT:    v_alignbit_b32 v4, s1, s1, 8
-; SI-NEXT:    v_alignbit_b32 v5, s1, s1, 24
-; SI-NEXT:    v_bfi_b32 v6, s8, v5, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_and_b32_e32 v0, s9, v2
-; SI-NEXT:    v_and_b32_e32 v1, s9, v3
-; SI-NEXT:    v_and_b32_e32 v4, s2, v6
-; SI-NEXT:    v_and_b32_e32 v5, s2, v7
-; SI-NEXT:    v_and_b32_e32 v2, s10, v2
-; SI-NEXT:    v_and_b32_e32 v3, s10, v3
-; SI-NEXT:    v_and_b32_e32 v6, s3, v6
-; SI-NEXT:    v_and_b32_e32 v7, s3, v7
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
-; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_or_b32_e32 v6, v6, v4
-; SI-NEXT:    v_or_b32_e32 v7, v7, v5
-; SI-NEXT:    s_mov_b32 s12, 0xaaaaaaaa
-; SI-NEXT:    v_and_b32_e32 v0, s11, v2
-; SI-NEXT:    v_and_b32_e32 v1, s11, v3
-; SI-NEXT:    v_and_b32_e32 v4, s9, v6
-; SI-NEXT:    v_and_b32_e32 v5, s9, v7
-; SI-NEXT:    v_and_b32_e32 v2, s12, v2
-; SI-NEXT:    v_and_b32_e32 v3, s12, v3
-; SI-NEXT:    v_and_b32_e32 v6, s10, v6
-; SI-NEXT:    v_and_b32_e32 v7, s10, v7
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
-; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v0, v6, v4
-; SI-NEXT:    v_or_b32_e32 v7, v7, v5
-; SI-NEXT:    v_and_b32_e32 v5, s11, v7
-; SI-NEXT:    v_and_b32_e32 v4, s11, v0
-; SI-NEXT:    v_and_b32_e32 v6, s12, v0
-; SI-NEXT:    v_and_b32_e32 v7, s12, v7
-; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_or_b32_e32 v0, v6, v4
-; SI-NEXT:    v_or_b32_e32 v1, v7, v5
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_brev_b64 s[2:3], s[2:3]
+; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -519,68 +328,15 @@
 ; FLAT:       ; %bb.0:
 ; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
-; FLAT-NEXT:    v_mov_b32_e32 v4, 0x10203
-; FLAT-NEXT:    s_mov_b32 s8, 0xf0f0f0f
-; FLAT-NEXT:    s_mov_b32 s9, 0xcccccccc
-; FLAT-NEXT:    s_mov_b32 s10, 0x55555555
-; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v3, 0, s2, v4
-; FLAT-NEXT:    v_perm_b32 v2, 0, s3, v4
-; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT:    v_and_b32_e32 v0, s8, v2
-; FLAT-NEXT:    v_and_b32_e32 v1, s8, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT:    v_perm_b32 v7, 0, s0, v4
-; FLAT-NEXT:    v_perm_b32 v6, 0, s1, v4
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
-; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
-; FLAT-NEXT:    v_and_b32_e32 v4, s8, v6
-; FLAT-NEXT:    v_and_b32_e32 v5, s8, v7
-; FLAT-NEXT:    v_and_b32_e32 v2, s9, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s9, v3
-; FLAT-NEXT:    v_and_b32_e32 v6, s2, v6
-; FLAT-NEXT:    v_and_b32_e32 v7, s2, v7
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
-; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 4, v[6:7]
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT:    s_mov_b32 s11, 0xaaaaaaaa
-; FLAT-NEXT:    v_and_b32_e32 v0, s10, v2
-; FLAT-NEXT:    v_and_b32_e32 v1, s10, v3
-; FLAT-NEXT:    v_and_b32_e32 v4, s3, v6
-; FLAT-NEXT:    v_and_b32_e32 v5, s3, v7
-; FLAT-NEXT:    v_and_b32_e32 v2, s11, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s11, v3
-; FLAT-NEXT:    v_and_b32_e32 v6, s9, v6
-; FLAT-NEXT:    v_and_b32_e32 v7, s9, v7
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
-; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 2, v[6:7]
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
-; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT:    v_and_b32_e32 v5, s10, v7
-; FLAT-NEXT:    v_and_b32_e32 v4, s10, v0
-; FLAT-NEXT:    v_and_b32_e32 v6, s11, v0
-; FLAT-NEXT:    v_and_b32_e32 v7, s11, v7
-; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
-; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
 ; FLAT-NEXT:    s_mov_b32 s7, 0xf000
 ; FLAT-NEXT:    s_mov_b32 s6, -1
-; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
-; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
+; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
+; FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
+; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
+; FLAT-NEXT:    v_mov_b32_e32 v0, s0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s1
+; FLAT-NEXT:    v_mov_b32_e32 v2, s2
+; FLAT-NEXT:    v_mov_b32_e32 v3, s3
 ; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
@@ -600,76 +356,13 @@
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s0, 0xff00ff
-; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT:    s_mov_b32 s3, 0x33333333
-; SI-NEXT:    s_mov_b32 s8, 0xcccccccc
-; SI-NEXT:    s_mov_b32 s9, 0x55555555
-; SI-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
-; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
-; SI-NEXT:    v_alignbit_b32 v5, v3, v3, 8
-; SI-NEXT:    v_alignbit_b32 v6, v0, v0, 8
-; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT:    v_alignbit_b32 v7, v1, v1, 8
-; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
-; SI-NEXT:    v_bfi_b32 v2, s0, v2, v4
-; SI-NEXT:    v_bfi_b32 v4, s0, v3, v5
-; SI-NEXT:    v_bfi_b32 v6, s0, v0, v6
-; SI-NEXT:    v_bfi_b32 v8, s0, v1, v7
-; SI-NEXT:    v_and_b32_e32 v1, s1, v2
-; SI-NEXT:    v_and_b32_e32 v0, s1, v4
-; SI-NEXT:    v_and_b32_e32 v3, s2, v2
-; SI-NEXT:    v_and_b32_e32 v2, s2, v4
-; SI-NEXT:    v_and_b32_e32 v5, s1, v6
-; SI-NEXT:    v_and_b32_e32 v4, s1, v8
-; SI-NEXT:    v_and_b32_e32 v7, s2, v6
-; SI-NEXT:    v_and_b32_e32 v6, s2, v8
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
-; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v7, v7, v5
-; SI-NEXT:    v_or_b32_e32 v6, v6, v4
-; SI-NEXT:    v_and_b32_e32 v1, s3, v3
-; SI-NEXT:    v_and_b32_e32 v0, s3, v2
-; SI-NEXT:    v_and_b32_e32 v5, s3, v7
-; SI-NEXT:    v_and_b32_e32 v4, s3, v6
-; SI-NEXT:    v_and_b32_e32 v3, s8, v3
-; SI-NEXT:    v_and_b32_e32 v2, s8, v2
-; SI-NEXT:    v_and_b32_e32 v7, s8, v7
-; SI-NEXT:    v_and_b32_e32 v6, s8, v6
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
-; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 2
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v7, v7, v5
-; SI-NEXT:    v_or_b32_e32 v6, v6, v4
-; SI-NEXT:    v_and_b32_e32 v1, s9, v3
-; SI-NEXT:    v_and_b32_e32 v0, s9, v2
-; SI-NEXT:    v_and_b32_e32 v5, s9, v7
-; SI-NEXT:    v_and_b32_e32 v4, s9, v6
-; SI-NEXT:    v_and_b32_e32 v3, s10, v3
-; SI-NEXT:    v_and_b32_e32 v2, s10, v2
-; SI-NEXT:    v_and_b32_e32 v7, s10, v7
-; SI-NEXT:    v_and_b32_e32 v6, s10, v6
-; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
-; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
-; SI-NEXT:    v_or_b32_e32 v3, v3, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_or_b32_e32 v1, v7, v5
-; SI-NEXT:    v_or_b32_e32 v0, v6, v4
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    v_bfrev_b32_e32 v4, v2
+; SI-NEXT:    v_bfrev_b32_e32 v3, v3
+; SI-NEXT:    v_bfrev_b32_e32 v2, v0
+; SI-NEXT:    v_bfrev_b32_e32 v1, v1
+; SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; FLAT-LABEL: v_brev_v2i64:
@@ -677,74 +370,19 @@
 ; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
-; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
-; FLAT-NEXT:    s_mov_b32 s8, 0xcccccccc
+; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT-NEXT:    v_mov_b32_e32 v1, s1
 ; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s0, 0x10203
-; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f
-; FLAT-NEXT:    s_mov_b32 s9, 0x55555555
-; FLAT-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
-; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v6, 0, v0, s0
-; FLAT-NEXT:    v_perm_b32 v4, 0, v3, s0
-; FLAT-NEXT:    v_perm_b32 v2, 0, v2, s0
-; FLAT-NEXT:    v_perm_b32 v8, 0, v1, s0
-; FLAT-NEXT:    v_and_b32_e32 v1, s1, v2
-; FLAT-NEXT:    v_and_b32_e32 v0, s1, v4
-; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
-; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
-; FLAT-NEXT:    v_and_b32_e32 v5, s1, v6
-; FLAT-NEXT:    v_and_b32_e32 v4, s1, v8
-; FLAT-NEXT:    v_and_b32_e32 v7, s2, v6
-; FLAT-NEXT:    v_and_b32_e32 v6, s2, v8
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
-; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 4, v[6:7]
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
-; FLAT-NEXT:    v_and_b32_e32 v5, s3, v7
-; FLAT-NEXT:    v_and_b32_e32 v4, s3, v6
-; FLAT-NEXT:    v_and_b32_e32 v3, s8, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s8, v2
-; FLAT-NEXT:    v_and_b32_e32 v7, s8, v7
-; FLAT-NEXT:    v_and_b32_e32 v6, s8, v6
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
-; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 2, v[6:7]
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
-; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT:    v_and_b32_e32 v1, s9, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s9, v2
-; FLAT-NEXT:    v_and_b32_e32 v5, s9, v7
-; FLAT-NEXT:    v_and_b32_e32 v4, s9, v6
-; FLAT-NEXT:    v_and_b32_e32 v3, s10, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s10, v2
-; FLAT-NEXT:    v_and_b32_e32 v7, s10, v7
-; FLAT-NEXT:    v_and_b32_e32 v6, s10, v6
-; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
-; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
-; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
-; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
-; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
-; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
+; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
+; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
+; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
+; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
 ; FLAT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %valptr, i32 %tid