diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4877,10 +4877,10 @@ const LLT HalfTy = LLT::scalar(NewBitSize); const LLT CondTy = LLT::scalar(1); - if (const MachineInstr *KShiftAmt = - getOpcodeDef(TargetOpcode::G_CONSTANT, Amt, MRI)) { - return narrowScalarShiftByConstant( - MI, KShiftAmt->getOperand(1).getCImm()->getValue(), HalfTy, ShiftAmtTy); + if (auto VRegAndVal = + getConstantVRegValWithLookThrough(Amt, MRI, true, false)) { + return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy, + ShiftAmtTy); } // TODO: Expand with known bits. diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll @@ -373,112 +373,60 @@ ; ; CHECK-LLSC-O0-LABEL: atomic_load_relaxed: ; CHECK-LLSC-O0: // %bb.0: -; CHECK-LLSC-O0-NEXT: sub sp, sp, #64 -; CHECK-LLSC-O0-NEXT: .cfi_def_cfa_offset 64 -; CHECK-LLSC-O0-NEXT: str x2, [sp, #48] // 8-byte Folded Spill -; CHECK-LLSC-O0-NEXT: str x3, [sp, #56] // 8-byte Folded Spill +; CHECK-LLSC-O0-NEXT: sub sp, sp, #48 +; CHECK-LLSC-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-LLSC-O0-NEXT: str x2, [sp, #32] // 8-byte Folded Spill +; CHECK-LLSC-O0-NEXT: str x3, [sp, #40] // 8-byte Folded Spill ; CHECK-LLSC-O0-NEXT: b .LBB4_1 ; CHECK-LLSC-O0-NEXT: .LBB4_1: // %atomicrmw.start ; CHECK-LLSC-O0-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-LLSC-O0-NEXT: ldr x11, [sp, #48] // 8-byte Folded Reload -; CHECK-LLSC-O0-NEXT: ldxp x9, x15, [x11] -; CHECK-LLSC-O0-NEXT: mov x12, xzr -; CHECK-LLSC-O0-NEXT: mov w8, #64 -; CHECK-LLSC-O0-NEXT: // kill: def $x8 killed $w8 -; CHECK-LLSC-O0-NEXT: mov w10, #64 -; CHECK-LLSC-O0-NEXT: // kill: def $x10 killed $w10 -; CHECK-LLSC-O0-NEXT: str x10, [sp, #8] // 8-byte Folded Spill -; CHECK-LLSC-O0-NEXT: subs x16, x10, #64 -; CHECK-LLSC-O0-NEXT: subs x13, x8, #64 -; CHECK-LLSC-O0-NEXT: lsl x14, x15, x10 -; CHECK-LLSC-O0-NEXT: lsr x13, x15, x13 -; CHECK-LLSC-O0-NEXT: orr x13, x13, x12 -; CHECK-LLSC-O0-NEXT: lsl x15, x15, x16 -; CHECK-LLSC-O0-NEXT: subs x16, x10, #64 -; CHECK-LLSC-O0-NEXT: csel x14, x14, x12, lo -; CHECK-LLSC-O0-NEXT: subs x16, x10, #64 -; CHECK-LLSC-O0-NEXT: csel x13, x13, x15, lo -; CHECK-LLSC-O0-NEXT: subs x15, x10, #0 -; CHECK-LLSC-O0-NEXT: csel x13, x12, x13, eq -; CHECK-LLSC-O0-NEXT: orr x9, x9, x14 -; CHECK-LLSC-O0-NEXT: orr x12, x12, x13 +; CHECK-LLSC-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; CHECK-LLSC-O0-NEXT: ldxp x9, x10, [x11] +; CHECK-LLSC-O0-NEXT: mov x8, xzr +; CHECK-LLSC-O0-NEXT: orr x9, x9, x8 +; CHECK-LLSC-O0-NEXT: orr x10, x8, x10 ; CHECK-LLSC-O0-NEXT: // implicit-def: $q0 ; CHECK-LLSC-O0-NEXT: mov v0.d[0], x9 +; CHECK-LLSC-O0-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-LLSC-O0-NEXT: mov v0.d[1], x10 ; CHECK-LLSC-O0-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-LLSC-O0-NEXT: mov v0.d[1], x12 -; CHECK-LLSC-O0-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-LLSC-O0-NEXT: subs x13, x10, #64 -; CHECK-LLSC-O0-NEXT: subs x8, x8, #64 -; CHECK-LLSC-O0-NEXT: lsl x8, x12, x8 -; CHECK-LLSC-O0-NEXT: orr x8, x8, x9, lsr #0 -; CHECK-LLSC-O0-NEXT: lsr x12, x12, x13 -; CHECK-LLSC-O0-NEXT: subs x13, x10, #64 -; CHECK-LLSC-O0-NEXT: csel x8, x8, x12, lo -; CHECK-LLSC-O0-NEXT: subs x10, x10, #0 -; CHECK-LLSC-O0-NEXT: 
csel x10, x9, x8, eq ; CHECK-LLSC-O0-NEXT: stxp w8, x9, x10, [x11] ; CHECK-LLSC-O0-NEXT: cbnz w8, .LBB4_1 ; CHECK-LLSC-O0-NEXT: b .LBB4_2 ; CHECK-LLSC-O0-NEXT: .LBB4_2: // %atomicrmw.end -; CHECK-LLSC-O0-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-LLSC-O0-NEXT: ldr x8, [sp, #56] // 8-byte Folded Reload +; CHECK-LLSC-O0-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-LLSC-O0-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload ; CHECK-LLSC-O0-NEXT: str q0, [x8] -; CHECK-LLSC-O0-NEXT: add sp, sp, #64 +; CHECK-LLSC-O0-NEXT: add sp, sp, #48 ; CHECK-LLSC-O0-NEXT: ret ; ; CHECK-CAS-O0-LABEL: atomic_load_relaxed: ; CHECK-CAS-O0: // %bb.0: -; CHECK-CAS-O0-NEXT: sub sp, sp, #64 -; CHECK-CAS-O0-NEXT: .cfi_def_cfa_offset 64 -; CHECK-CAS-O0-NEXT: str x2, [sp, #48] // 8-byte Folded Spill -; CHECK-CAS-O0-NEXT: str x3, [sp, #56] // 8-byte Folded Spill +; CHECK-CAS-O0-NEXT: sub sp, sp, #48 +; CHECK-CAS-O0-NEXT: .cfi_def_cfa_offset 48 +; CHECK-CAS-O0-NEXT: str x2, [sp, #32] // 8-byte Folded Spill +; CHECK-CAS-O0-NEXT: str x3, [sp, #40] // 8-byte Folded Spill ; CHECK-CAS-O0-NEXT: b .LBB4_1 ; CHECK-CAS-O0-NEXT: .LBB4_1: // %atomicrmw.start ; CHECK-CAS-O0-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-CAS-O0-NEXT: ldr x11, [sp, #48] // 8-byte Folded Reload -; CHECK-CAS-O0-NEXT: ldxp x9, x15, [x11] -; CHECK-CAS-O0-NEXT: mov x12, #0 -; CHECK-CAS-O0-NEXT: mov w8, #64 -; CHECK-CAS-O0-NEXT: // kill: def $x8 killed $w8 -; CHECK-CAS-O0-NEXT: mov w10, #64 -; CHECK-CAS-O0-NEXT: // kill: def $x10 killed $w10 -; CHECK-CAS-O0-NEXT: str x10, [sp, #8] // 8-byte Folded Spill -; CHECK-CAS-O0-NEXT: subs x16, x10, #64 -; CHECK-CAS-O0-NEXT: subs x13, x8, #64 -; CHECK-CAS-O0-NEXT: lsl x14, x15, x10 -; CHECK-CAS-O0-NEXT: lsr x13, x15, x13 -; CHECK-CAS-O0-NEXT: orr x13, x13, x12 -; CHECK-CAS-O0-NEXT: lsl x15, x15, x16 -; CHECK-CAS-O0-NEXT: subs x16, x10, #64 -; CHECK-CAS-O0-NEXT: csel x14, x14, x12, lo -; CHECK-CAS-O0-NEXT: subs x16, x10, #64 -; CHECK-CAS-O0-NEXT: csel x13, x13, x15, lo -; CHECK-CAS-O0-NEXT: subs x15, x10, #0 -; CHECK-CAS-O0-NEXT: csel x13, x12, x13, eq -; CHECK-CAS-O0-NEXT: orr x9, x9, x14 -; CHECK-CAS-O0-NEXT: orr x12, x12, x13 +; CHECK-CAS-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload +; CHECK-CAS-O0-NEXT: ldxp x9, x10, [x11] +; CHECK-CAS-O0-NEXT: mov x8, #0 +; CHECK-CAS-O0-NEXT: orr x9, x9, x8 +; CHECK-CAS-O0-NEXT: orr x10, x8, x10 ; CHECK-CAS-O0-NEXT: // implicit-def: $q0 ; CHECK-CAS-O0-NEXT: mov v0.d[0], x9 +; CHECK-CAS-O0-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-CAS-O0-NEXT: mov v0.d[1], x10 ; CHECK-CAS-O0-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-CAS-O0-NEXT: mov v0.d[1], x12 -; CHECK-CAS-O0-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-CAS-O0-NEXT: subs x13, x10, #64 -; CHECK-CAS-O0-NEXT: subs x8, x8, #64 -; CHECK-CAS-O0-NEXT: lsl x8, x12, x8 -; CHECK-CAS-O0-NEXT: orr x8, x8, x9, lsr #0 -; CHECK-CAS-O0-NEXT: lsr x12, x12, x13 -; CHECK-CAS-O0-NEXT: subs x13, x10, #64 -; CHECK-CAS-O0-NEXT: csel x8, x8, x12, lo -; CHECK-CAS-O0-NEXT: subs x10, x10, #0 -; CHECK-CAS-O0-NEXT: csel x10, x9, x8, eq ; CHECK-CAS-O0-NEXT: stxp w8, x9, x10, [x11] ; CHECK-CAS-O0-NEXT: cbnz w8, .LBB4_1 ; CHECK-CAS-O0-NEXT: b .LBB4_2 ; CHECK-CAS-O0-NEXT: .LBB4_2: // %atomicrmw.end -; CHECK-CAS-O0-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-CAS-O0-NEXT: ldr x8, [sp, #56] // 8-byte Folded Reload +; CHECK-CAS-O0-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-CAS-O0-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload ; CHECK-CAS-O0-NEXT: str q0, [x8] -; 
CHECK-CAS-O0-NEXT: add sp, sp, #64 +; CHECK-CAS-O0-NEXT: add sp, sp, #48 ; CHECK-CAS-O0-NEXT: ret %r = load atomic i128, i128* %p monotonic, align 16 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-and.mir @@ -44,87 +44,38 @@ ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[DEF]](s32) ; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 - ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C3]], [[C4]] - ; CHECK: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C3]] - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C3]](s64), [[C4]] - ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP]](s32) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C3]](s64), [[C]] - ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP1]](s32) ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[C3]](s64) - ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[SUB1]](s64) ; CHECK: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64) - ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL1]] - ; CHECK: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[SUB]](s64) - ; CHECK: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[SHL]], [[C]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[OR]], [[SHL2]] - ; CHECK: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC1]](s1), [[DEF1]], [[SELECT1]] - ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SELECT]], [[ZEXTLOAD]] - ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SELECT2]], [[C]] - ; CHECK: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]] - ; CHECK: [[SUB3:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]] - ; CHECK: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C4]](s64), [[C4]] - ; CHECK: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP2]](s32) - ; CHECK: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C4]](s64), [[C]] - ; CHECK: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP3]](s32) - ; CHECK: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[C4]](s64) - ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[OR1]], [[SUB3]](s64) - ; CHECK: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR2]], [[C4]](s64) - ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[LSHR1]], [[SHL4]] - ; CHECK: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[SUB2]](s64) - ; CHECK: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[SHL3]], [[C]] - ; CHECK: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[OR3]], [[SHL5]] - ; CHECK: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC3]](s1), [[OR2]], [[SELECT4]] - ; CHECK: [[OR4:%[0-9]+]]:_(s64) = G_OR [[SELECT3]], [[LOAD]] - ; CHECK: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SELECT5]], [[C]] + ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[C4]](s64) + ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]] + ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[ZEXTLOAD]] + ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[C]], [[LOAD]] + ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[OR1]], [[C]] ; CHECK: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load (s64), align 16) ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) ; CHECK: [[ZEXTLOAD1:%[0-9]+]]:_(s64) = G_ZEXTLOAD [[COPY1]](p0) :: (load (s16) from unknown-address + 8, align 8) ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s64) ; CHECK: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 10, align 2) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD3]](s32), [[DEF]](s32) - ; CHECK: [[SUB4:%[0-9]+]]:_(s64) = G_SUB [[C3]], [[C4]] - ; CHECK: [[SUB5:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C3]] - ; CHECK: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C3]](s64), [[C4]] - ; CHECK: [[TRUNC4:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP4]](s32) - ; CHECK: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C3]](s64), [[C]] - ; CHECK: [[TRUNC5:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP5]](s32) - ; CHECK: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[MV1]], [[C3]](s64) - ; CHECK: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[MV1]], [[SUB5]](s64) - ; CHECK: [[SHL7:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64) - ; CHECK: [[OR6:%[0-9]+]]:_(s64) = G_OR [[LSHR2]], [[SHL7]] - ; CHECK: [[SHL8:%[0-9]+]]:_(s64) = G_SHL [[MV1]], [[SUB4]](s64) - ; CHECK: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC4]](s1), [[SHL6]], [[C]] - ; CHECK: [[SELECT7:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC4]](s1), [[OR6]], [[SHL8]] - ; CHECK: [[SELECT8:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC5]](s1), [[DEF1]], [[SELECT7]] - ; CHECK: [[OR7:%[0-9]+]]:_(s64) = G_OR [[SELECT6]], [[ZEXTLOAD1]] - ; CHECK: [[OR8:%[0-9]+]]:_(s64) = G_OR [[SELECT8]], [[C]] - ; CHECK: [[SUB6:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]] - ; CHECK: [[SUB7:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]] - ; CHECK: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C4]](s64), [[C4]] - ; CHECK: [[TRUNC6:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP6]](s32) - ; CHECK: [[ICMP7:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C4]](s64), [[C]] - ; CHECK: [[TRUNC7:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP7]](s32) - ; CHECK: [[SHL9:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[C4]](s64) - ; CHECK: [[LSHR3:%[0-9]+]]:_(s64) = G_LSHR [[OR7]], [[SUB7]](s64) - ; CHECK: [[SHL10:%[0-9]+]]:_(s64) = G_SHL [[OR8]], [[C4]](s64) - ; CHECK: [[OR9:%[0-9]+]]:_(s64) = G_OR [[LSHR3]], [[SHL10]] - ; CHECK: [[SHL11:%[0-9]+]]:_(s64) = G_SHL [[OR7]], [[SUB6]](s64) - ; CHECK: [[SELECT9:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC6]](s1), [[SHL9]], [[C]] - ; CHECK: [[SELECT10:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC6]](s1), [[OR9]], [[SHL11]] - ; CHECK: [[SELECT11:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC7]](s1), [[OR8]], [[SELECT10]] - ; CHECK: [[OR10:%[0-9]+]]:_(s64) = G_OR [[SELECT9]], [[LOAD2]] - ; CHECK: [[OR11:%[0-9]+]]:_(s64) = G_OR [[SELECT11]], [[C]] - ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[OR4]], [[OR10]] - ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[OR5]], [[OR11]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[MV1]], [[C3]](s64) + ; CHECK: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[MV1]], [[C4]](s64) + ; CHECK: [[OR4:%[0-9]+]]:_(s64) = G_OR [[SHL3]], [[LSHR1]] + ; CHECK: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXTLOAD1]] + ; CHECK: [[OR6:%[0-9]+]]:_(s64) = G_OR [[C]], [[LOAD2]] + ; CHECK: [[OR7:%[0-9]+]]:_(s64) = G_OR [[OR5]], [[C]] + ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[OR2]], [[OR6]] + ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[OR3]], [[OR7]] ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY [[AND]](s64) - ; CHECK: [[TRUNC8:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) ; CHECK: G_STORE [[COPY2]](s64), %ptr(p0) :: (store (s64), align 16) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC8]](s32) - ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s64) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s64) ; 
CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C2]](s64) ; CHECK: G_STORE [[COPY3]](s32), [[PTR_ADD]](p0) :: (store (s16) into unknown-address + 8, align 8) - ; CHECK: G_STORE [[LSHR4]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2) + ; CHECK: G_STORE [[LSHR2]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2) %ptr:_(p0) = COPY $x0 %a:_(s88) = G_LOAD %ptr(p0) :: (load (s88)) %b:_(s88) = G_LOAD %ptr(p0) :: (load (s88)) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bswap.mir @@ -151,22 +151,11 @@ ; CHECK: [[BSWAP1:%[0-9]+]]:_(s64) = G_BSWAP [[DEF]] ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 - ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[C2]] - ; CHECK: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C2]], [[C]] - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C]](s64), [[C2]] - ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP]](s32) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C]](s64), [[C1]] - ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP1]](s32) - ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP1]], [[C]](s64) - ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64) - ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[BSWAP1]], [[SUB1]](s64) - ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR1]], [[SHL]] - ; CHECK: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP1]], [[SUB]](s64) - ; CHECK: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[OR]], [[LSHR2]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC1]](s1), [[BSWAP]], [[SELECT]] - ; CHECK: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[LSHR]], [[C1]] - ; CHECK: $x0 = COPY [[SELECT1]](s64) + ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BSWAP]], [[C]](s64) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[BSWAP1]], [[C2]](s64) + ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL]] + ; CHECK: $x0 = COPY [[OR]](s64) ; CHECK: RET_ReallyLR implicit $x0 %val:_(s88) = G_IMPLICIT_DEF %bswap:_(s88) = G_BSWAP %val diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir @@ -609,48 +609,24 @@ ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[DEF]](s32) ; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 - ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C3]], [[C4]] - ; CHECK: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C3]] - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C3]](s64), [[C4]] - ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP]](s32) - ; CHECK: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C3]](s64), [[C]] - ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP1]](s32) ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[MV]], [[C3]](s64) - ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[SUB1]](s64) ; CHECK: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[DEF1]], [[C3]](s64) - ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[LSHR]], [[SHL1]] - ; CHECK: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[MV]], 
[[SUB]](s64) - ; CHECK: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[SHL]], [[C]] - ; CHECK: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC]](s1), [[OR]], [[SHL2]] - ; CHECK: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC1]](s1), [[DEF1]], [[SELECT1]] - ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SELECT]], [[ZEXTLOAD]] - ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SELECT2]], [[C]] - ; CHECK: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]] - ; CHECK: [[SUB3:%[0-9]+]]:_(s64) = G_SUB [[C4]], [[C4]] - ; CHECK: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[C4]](s64), [[C4]] - ; CHECK: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP2]](s32) - ; CHECK: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[C4]](s64), [[C]] - ; CHECK: [[TRUNC3:%[0-9]+]]:_(s1) = G_TRUNC [[ICMP3]](s32) - ; CHECK: [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[C4]](s64) - ; CHECK: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[OR1]], [[SUB3]](s64) - ; CHECK: [[SHL4:%[0-9]+]]:_(s64) = G_SHL [[OR2]], [[C4]](s64) - ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[LSHR1]], [[SHL4]] - ; CHECK: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[OR1]], [[SUB2]](s64) - ; CHECK: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[SHL3]], [[C]] - ; CHECK: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC2]](s1), [[OR3]], [[SHL5]] - ; CHECK: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[TRUNC3]](s1), [[OR2]], [[SELECT4]] - ; CHECK: [[OR4:%[0-9]+]]:_(s64) = G_OR [[SELECT3]], [[LOAD]] - ; CHECK: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SELECT5]], [[C]] - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY [[OR4]](s64) - ; CHECK: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[OR5]](s64) + ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[MV]], [[C4]](s64) + ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]] + ; CHECK: [[OR1:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[ZEXTLOAD]] + ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; CHECK: [[OR2:%[0-9]+]]:_(s64) = G_OR [[C]], [[LOAD]] + ; CHECK: [[OR3:%[0-9]+]]:_(s64) = G_OR [[OR1]], [[C]] + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY [[OR2]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[OR3]](s64) ; CHECK: G_STORE [[COPY]](s64), %ptr(p0) :: (store (s64), align 16) ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64) - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC4]](s32) - ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C3]](s64) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C3]](s64) ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD2]], [[C2]](s64) ; CHECK: G_STORE [[COPY1]](s32), [[PTR_ADD2]](p0) :: (store (s16) into unknown-address + 8, align 8) - ; CHECK: G_STORE [[LSHR2]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2) + ; CHECK: G_STORE [[LSHR1]](s32), [[PTR_ADD3]](p0) :: (store (s8) into unknown-address + 10, align 2) ; CHECK: RET_ReallyLR %ptr:_(p0) = COPY $x0 %load:_(s88) = G_LOAD %ptr(p0) :: (load (s88)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -4688,26 +4688,26 @@ ; GFX6-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 -; GFX6-NEXT: s_lshl_b32 s7, s6, 31 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX6-NEXT: s_mov_b32 s6, s11 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_lshr_b64 
s[0:1], s[4:5], 1 +; GFX6-NEXT: s_lshl_b32 s5, s6, 31 +; GFX6-NEXT: s_mov_b32 s4, s11 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX6-NEXT: s_sub_i32 s12, s8, 64 ; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] @@ -4735,26 +4735,26 @@ ; GFX8-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 -; GFX8-NEXT: s_lshl_b32 s7, s6, 31 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX8-NEXT: s_mov_b32 s6, s11 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX8-NEXT: s_lshl_b32 s5, s6, 31 +; GFX8-NEXT: s_mov_b32 s4, s11 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX8-NEXT: s_sub_i32 s12, s8, 64 ; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] @@ -4782,26 +4782,26 @@ ; GFX9-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 -; GFX9-NEXT: s_lshl_b32 s7, s6, 31 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_mov_b32 s6, s11 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX9-NEXT: s_lshl_b32 s5, s6, 31 +; GFX9-NEXT: s_mov_b32 s4, s11 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX9-NEXT: s_sub_i32 s12, s8, 64 ; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 
s[6:7], s[0:1], s8 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] @@ -4832,21 +4832,21 @@ ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s5, s6, 31 ; GFX10-NEXT: s_mov_b32 s4, s11 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_sub_i32 s14, s8, 64 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 @@ -4882,24 +4882,24 @@ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc -; GFX6-NEXT: v_lshr_b64 v[2:3], v[4:5], 1 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v6 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15 ; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[2:3], v15 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v15 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], v14 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: 
v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 @@ -4931,24 +4931,24 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[4:5] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15 ; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 @@ -4980,24 +4980,24 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v6 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 ; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 @@ -5083,26 +5083,26 @@ ; GFX6-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; 
GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v7 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v2 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 +; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 ; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[0:1], v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -5136,26 +5136,26 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] ; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -5188,27 +5188,27 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 ; 
GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] ; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -5295,33 +5295,33 @@ ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: v_lshr_b64 v[3:4], v[2:3], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: s_sub_i32 s3, 64, s4 ; GFX6-NEXT: s_sub_i32 s2, s4, 64 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: v_lshr_b64 v[5:6], v[0:1], s4 -; GFX6-NEXT: v_lshl_b64 v[7:8], v[3:4], s3 -; GFX6-NEXT: v_lshr_b64 v[9:10], v[3:4], s4 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s3 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[3:4], s2 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s2 ; GFX6-NEXT: s_and_b32 s2, 1, s5 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX6-NEXT: s_and_b32 s2, 1, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX6-NEXT: s_and_b32 s2, 1, s5 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v9, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 @@ -5350,33 +5350,33 @@ ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[3:4], 1, v[2:3] -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: s_sub_i32 s3, 64, s4 ; GFX8-NEXT: s_sub_i32 s2, s4, 64 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: v_lshrrev_b64 v[5:6], s4, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[7:8], s3, v[3:4] -; 
GFX8-NEXT: v_lshrrev_b64 v[9:10], s4, v[3:4] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s3, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s2, v[3:4] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s2, v[2:3] ; GFX8-NEXT: s_and_b32 s2, 1, s5 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX8-NEXT: s_and_b32 s2, 1, s8 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_and_b32 s2, 1, s5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 @@ -5405,33 +5405,33 @@ ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[3:4], 1, v[2:3] -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: s_sub_i32 s3, 64, s4 ; GFX9-NEXT: s_sub_i32 s2, s4, 64 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: v_lshrrev_b64 v[5:6], s4, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[7:8], s3, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[9:10], s4, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s3, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s2, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s2, v[2:3] ; GFX9-NEXT: s_and_b32 s2, 1, s5 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX9-NEXT: s_and_b32 s2, 1, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX9-NEXT: s_and_b32 s2, 1, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_or_b32_e32 v2, s0, v2 @@ -5512,15 +5512,15 @@ ; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 ; GFX6-NEXT: s_and_b32 s5, 1, s9 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[2:3], 1 -; GFX6-NEXT: s_lshl_b32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b32 s9, s2, 31 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_mov_b32 s2, s7 +; GFX6-NEXT: s_mov_b32 s8, s7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; 
GFX6-NEXT: s_and_b32 s5, 1, s10 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_sub_i32 s10, s4, 64 -; GFX6-NEXT: s_sub_i32 s6, 64, s4 +; GFX6-NEXT: s_sub_i32 s8, 64, s4 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 @@ -5532,19 +5532,19 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], s4 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_lshr_b64 s[6:7], s[8:9], s10 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -5567,15 +5567,15 @@ ; GFX8-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX8-NEXT: s_and_b32 s5, 1, s9 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[2:3], 1 -; GFX8-NEXT: s_lshl_b32 s3, s2, 31 +; GFX8-NEXT: s_lshl_b32 s9, s2, 31 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_mov_b32 s2, s7 +; GFX8-NEXT: s_mov_b32 s8, s7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_sub_i32 s10, s4, 64 -; GFX8-NEXT: s_sub_i32 s6, 64, s4 +; GFX8-NEXT: s_sub_i32 s8, 64, s4 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -5587,19 +5587,19 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[8:9], s4 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_lshr_b64 s[6:7], s[8:9], s10 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -5622,15 +5622,15 @@ ; GFX9-NEXT: v_lshlrev_b64 v[8:9], s8, 
v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX9-NEXT: s_and_b32 s5, 1, s9 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[2:3], 1 -; GFX9-NEXT: s_lshl_b32 s3, s2, 31 +; GFX9-NEXT: s_lshl_b32 s9, s2, 31 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_mov_b32 s2, s7 +; GFX9-NEXT: s_mov_b32 s8, s7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_sub_i32 s10, s4, 64 -; GFX9-NEXT: s_sub_i32 s6, 64, s4 +; GFX9-NEXT: s_sub_i32 s8, 64, s4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 @@ -5642,19 +5642,19 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s4 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: s_lshr_b64 s[6:7], s[8:9], s10 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -5723,50 +5723,46 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_fshl_i128_65: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s9, 0 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 -; GFX6-NEXT: s_lshr_b32 s8, s7, 31 -; GFX6-NEXT: s_lshr_b32 s0, s5, 31 -; GFX6-NEXT: s_mov_b32 s1, s9 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: s_lshr_b32 s4, s5, 31 +; GFX6-NEXT: s_mov_b32 s5, 0 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX6-NEXT: s_lshr_b32 s4, s7, 31 +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_i128_65: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s9, 0 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 -; GFX8-NEXT: s_lshr_b32 s8, s7, 31 -; GFX8-NEXT: s_lshr_b32 s0, s5, 31 -; GFX8-NEXT: s_mov_b32 s1, s9 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX8-NEXT: s_lshr_b32 s4, s5, 31 +; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_lshr_b32 s4, s7, 31 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i128_65: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 -; GFX9-NEXT: s_lshr_b32 s8, s7, 31 -; GFX9-NEXT: s_lshr_b32 s0, s5, 31 -; GFX9-NEXT: s_mov_b32 s1, s9 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_or_b64 
s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX9-NEXT: s_lshr_b32 s4, s5, 31 +; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_lshr_b32 s4, s7, 31 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i128_65: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 1 -; GFX10-NEXT: s_lshr_b32 s0, s5, 31 -; GFX10-NEXT: s_lshr_b32 s8, s7, 31 -; GFX10-NEXT: s_mov_b32 s1, s9 +; GFX10-NEXT: s_lshr_b32 s2, s5, 31 +; GFX10-NEXT: s_mov_b32 s3, 0 ; GFX10-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX10-NEXT: s_lshr_b32 s2, s7, 31 +; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result @@ -5778,9 +5774,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshl_b64 v[2:3], v[0:1], 1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 31, v5 -; GFX6-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -5789,9 +5785,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 31, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5800,9 +5796,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX9-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v5 -; GFX9-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -5843,26 +5839,26 @@ ; GFX6-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s29, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], 1 -; GFX6-NEXT: s_lshl_b32 s11, s10, 31 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 -; GFX6-NEXT: s_mov_b32 s10, s19 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX6-NEXT: s_lshl_b32 s9, s10, 31 +; GFX6-NEXT: s_mov_b32 s8, s19 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 ; GFX6-NEXT: s_sub_i32 s26, s16, 64 ; GFX6-NEXT: s_sub_i32 s22, 64, s16 ; GFX6-NEXT: s_cmp_lt_u32 s16, 64 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s16, 0 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 ; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX6-NEXT: s_lshr_b64 
s[0:1], s[0:1], s26 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] @@ -5881,30 +5877,30 @@ ; GFX6-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 -; GFX6-NEXT: s_lshl_b32 s13, s14, 31 -; GFX6-NEXT: s_mov_b32 s12, s19 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], 1 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 +; GFX6-NEXT: s_lshl_b32 s9, s14, 31 +; GFX6-NEXT: s_mov_b32 s8, s19 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 ; GFX6-NEXT: s_sub_i32 s18, s10, 64 ; GFX6-NEXT: s_sub_i32 s14, 64, s10 ; GFX6-NEXT: s_cmp_lt_u32 s10, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[4:5], s14 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s18 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 ; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] @@ -5932,26 +5928,26 @@ ; GFX8-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] ; GFX8-NEXT: s_cmp_lg_u32 s29, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], 1 -; GFX8-NEXT: s_lshl_b32 s11, s10, 31 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 -; GFX8-NEXT: s_mov_b32 s10, s19 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX8-NEXT: s_lshl_b32 s9, s10, 31 +; GFX8-NEXT: s_mov_b32 s8, s19 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 ; GFX8-NEXT: s_sub_i32 s26, s16, 64 ; GFX8-NEXT: s_sub_i32 s22, 64, s16 ; GFX8-NEXT: s_cmp_lt_u32 s16, 64 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s16, 0 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 ; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s26 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: 
s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] @@ -5970,30 +5966,30 @@ ; GFX8-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 -; GFX8-NEXT: s_lshl_b32 s13, s14, 31 -; GFX8-NEXT: s_mov_b32 s12, s19 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], 1 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 +; GFX8-NEXT: s_lshl_b32 s9, s14, 31 +; GFX8-NEXT: s_mov_b32 s8, s19 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 ; GFX8-NEXT: s_sub_i32 s18, s10, 64 ; GFX8-NEXT: s_sub_i32 s14, 64, s10 ; GFX8-NEXT: s_cmp_lt_u32 s10, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[4:5], s14 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s18 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 ; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] @@ -6021,26 +6017,26 @@ ; GFX9-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], 1 -; GFX9-NEXT: s_lshl_b32 s11, s10, 31 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], 1 -; GFX9-NEXT: s_mov_b32 s10, s19 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX9-NEXT: s_lshl_b32 s9, s10, 31 +; GFX9-NEXT: s_mov_b32 s8, s19 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 ; GFX9-NEXT: s_sub_i32 s26, s16, 64 ; GFX9-NEXT: s_sub_i32 s22, 64, s16 ; GFX9-NEXT: s_cmp_lt_u32 s16, 64 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s16, 0 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 ; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s26 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 
s28, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] @@ -6059,30 +6055,30 @@ ; GFX9-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 -; GFX9-NEXT: s_lshl_b32 s13, s14, 31 -; GFX9-NEXT: s_mov_b32 s12, s19 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 1 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 +; GFX9-NEXT: s_lshl_b32 s9, s14, 31 +; GFX9-NEXT: s_mov_b32 s8, s19 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 ; GFX9-NEXT: s_sub_i32 s18, s10, 64 ; GFX9-NEXT: s_sub_i32 s14, 64, s10 ; GFX9-NEXT: s_cmp_lt_u32 s10, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[4:5], s14 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s18 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 ; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] @@ -6113,21 +6109,21 @@ ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 ; GFX10-NEXT: s_lshl_b32 s9, s10, 31 ; GFX10-NEXT: s_mov_b32 s8, s19 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_sub_i32 s26, s16, 64 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 ; GFX10-NEXT: s_sub_i32 s17, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s16 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s17 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 @@ -6155,21 +6151,21 @@ ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 ; GFX10-NEXT: s_lshl_b32 s13, s14, 31 ; GFX10-NEXT: s_mov_b32 s12, s19 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], 1 -; 
GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; GFX10-NEXT: s_sub_i32 s18, s10, 64 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 ; GFX10-NEXT: s_sub_i32 s11, 64, s10 ; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[14:15], s10 -; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 +; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 @@ -6248,24 +6244,24 @@ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc -; GFX6-NEXT: v_lshr_b64 v[6:7], v[12:13], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 31, v14 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], 1 -; GFX6-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v17 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v17 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v10 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v17 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 64, v17 ; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v17 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v12 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v17 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v4, v18, v4 @@ -6340,24 +6336,24 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc -; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[12:13] -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 31, v14 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[14:15] -; GFX8-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v17 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] ; 
GFX8-NEXT: v_subrev_u32_e32 v12, vcc, 64, v17 ; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v4, v18, v4 @@ -6425,31 +6421,31 @@ ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[12:13] -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 31, v14 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[14:15] -; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v17 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] ; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v17 ; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v12, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX9-NEXT: v_or_b32_e32 v4, v18, v4 @@ -6484,14 +6480,14 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 ; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo ; GFX10-NEXT: 
v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo @@ -6499,35 +6495,35 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 ; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 ; GFX10-NEXT: v_or_b32_e32 v1, v11, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v23 -; GFX10-NEXT: v_and_b32_e32 v25, s7, v10 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v9, v9, v16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] -; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v5, v11, v13 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -4762,10 +4762,10 @@ ; GFX6-NEXT: s_mov_b32 s11, 0 ; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s10, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] ; GFX6-NEXT: s_sub_i32 s13, s8, 64 ; GFX6-NEXT: s_sub_i32 s9, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 @@ -4809,10 +4809,10 @@ ; GFX8-NEXT: s_mov_b32 s11, 0 ; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s10, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] ; GFX8-NEXT: s_sub_i32 s13, s8, 64 ; 
GFX8-NEXT: s_sub_i32 s9, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 @@ -4856,10 +4856,10 @@ ; GFX9-NEXT: s_mov_b32 s11, 0 ; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] ; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s10, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] ; GFX9-NEXT: s_sub_i32 s13, s8, 64 ; GFX9-NEXT: s_sub_i32 s9, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 @@ -4906,7 +4906,7 @@ ; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_lshr_b32 s10, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[10:11], s[2:3] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] ; GFX10-NEXT: s_sub_i32 s13, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 @@ -4958,7 +4958,7 @@ ; GFX6-NEXT: v_and_b32_e32 v15, s4, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15 @@ -5007,7 +5007,7 @@ ; GFX8-NEXT: v_and_b32_e32 v15, s4, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] @@ -5056,7 +5056,7 @@ ; GFX9-NEXT: v_and_b32_e32 v15, s4, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] @@ -5106,7 +5106,7 @@ ; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 ; GFX10-NEXT: v_and_b32_e32 v18, s4, v9 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v18 @@ -5155,12 +5155,12 @@ ; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX6-NEXT: s_lshr_b32 s8, s1, 31 ; GFX6-NEXT: s_mov_b32 s9, 0 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_lshr_b32 s8, s1, 31 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7 -; GFX6-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 @@ -5208,12 +5208,12 @@ ; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX8-NEXT: s_lshr_b32 s8, s1, 31 ; GFX8-NEXT: s_mov_b32 s9, 0 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_lshr_b32 s8, s1, 31 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX8-NEXT: v_sub_u32_e32 v0, 
vcc, 64, v7 -; GFX8-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 @@ -5261,12 +5261,12 @@ ; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 31 ; GFX9-NEXT: s_mov_b32 s9, 0 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b32 s8, s1, 31 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7 -; GFX9-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 @@ -5319,7 +5319,7 @@ ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[2:3] +; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] @@ -5368,10 +5368,10 @@ ; GFX6-NEXT: s_mov_b32 s7, 0 ; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s6, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] ; GFX6-NEXT: s_sub_i32 s9, s4, 64 ; GFX6-NEXT: s_sub_i32 s5, 64, s4 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 @@ -5424,10 +5424,10 @@ ; GFX8-NEXT: s_mov_b32 s7, 0 ; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s6, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] ; GFX8-NEXT: s_sub_i32 s9, s4, 64 ; GFX8-NEXT: s_sub_i32 s5, 64, s4 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 @@ -5480,10 +5480,10 @@ ; GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s6, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] ; GFX9-NEXT: s_sub_i32 s9, s4, 64 ; GFX9-NEXT: s_sub_i32 s5, 64, s4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 @@ -5539,7 +5539,7 @@ ; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b32 s6, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: s_sub_i32 s9, s4, 64 ; GFX10-NEXT: s_sub_i32 s5, 64, s4 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 @@ -5600,7 +5600,7 @@ ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX6-NEXT: s_cselect_b32 s7, 
1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 @@ -5654,7 +5654,7 @@ ; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX8-NEXT: s_cselect_b32 s7, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 @@ -5708,7 +5708,7 @@ ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: s_cselect_b32 s7, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 @@ -5759,7 +5759,7 @@ ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: s_andn2_b64 s[8:9], s[6:7], s[4:5] ; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] -; GFX10-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX10-NEXT: s_sub_i32 s4, 64, s8 ; GFX10-NEXT: s_sub_i32 s5, s8, 64 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 @@ -5815,9 +5815,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: s_lshl_b32 s5, s0, 31 ; GFX6-NEXT: s_lshl_b32 s3, s2, 31 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX6-NEXT: s_mov_b32 s2, s4 -; GFX6-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 ; GFX6-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; GFX6-NEXT: ; return to shader part epilog @@ -5827,9 +5827,9 @@ ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: s_lshl_b32 s5, s0, 31 ; GFX8-NEXT: s_lshl_b32 s3, s2, 31 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX8-NEXT: s_mov_b32 s2, s4 -; GFX8-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 ; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; GFX8-NEXT: ; return to shader part epilog @@ -5839,9 +5839,9 @@ ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_lshl_b32 s5, s0, 31 ; GFX9-NEXT: s_lshl_b32 s3, s2, 31 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 ; GFX9-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; GFX9-NEXT: ; return to shader part epilog @@ -5851,11 +5851,11 @@ ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_lshl_b32 s3, s2, 31 ; GFX10-NEXT: s_lshl_b32 s5, s0, 31 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result @@ -5865,37 +5865,34 @@ ; GFX6-LABEL: v_fshr_i128_65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 31, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v4 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 -; 
GFX6-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_i128_65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 31, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 31, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v4 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i128_65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 31, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 31, v2 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i128_65: @@ -5908,7 +5905,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 31, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v8 ; GFX10-NEXT: v_or_b32_e32 v1, v9, v5 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v0 +; GFX10-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) @@ -5924,9 +5921,9 @@ ; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 -; GFX6-NEXT: s_mov_b32 s1, s19 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: s_mov_b32 s1, s19 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: s_sub_i32 s23, s16, 64 ; GFX6-NEXT: s_sub_i32 s17, 64, s16 ; GFX6-NEXT: s_cmp_lt_u32 s16, 64 @@ -5964,10 +5961,10 @@ ; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] ; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX6-NEXT: s_lshr_b32 s18, s5, 31 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19] ; GFX6-NEXT: s_sub_i32 s9, s10, 64 ; GFX6-NEXT: s_sub_i32 s11, 64, s10 ; GFX6-NEXT: s_cmp_lt_u32 s10, 64 @@ -6013,9 +6010,9 @@ ; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 -; GFX8-NEXT: s_mov_b32 s1, s19 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_mov_b32 s1, s19 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: s_sub_i32 s23, s16, 64 ; GFX8-NEXT: s_sub_i32 s17, 64, s16 ; GFX8-NEXT: s_cmp_lt_u32 s16, 64 @@ -6053,10 +6050,10 @@ ; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] ; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX8-NEXT: s_lshr_b32 s18, s5, 31 -; GFX8-NEXT: s_lshl_b64 s[4:5], 
s[6:7], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19] ; GFX8-NEXT: s_sub_i32 s9, s10, 64 ; GFX8-NEXT: s_sub_i32 s11, 64, s10 ; GFX8-NEXT: s_cmp_lt_u32 s10, 64 @@ -6102,9 +6099,9 @@ ; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 -; GFX9-NEXT: s_mov_b32 s1, s19 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_mov_b32 s1, s19 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: s_sub_i32 s23, s16, 64 ; GFX9-NEXT: s_sub_i32 s17, 64, s16 ; GFX9-NEXT: s_cmp_lt_u32 s16, 64 @@ -6142,10 +6139,10 @@ ; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] ; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX9-NEXT: s_lshr_b32 s18, s5, 31 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[18:19] ; GFX9-NEXT: s_sub_i32 s9, s10, 64 ; GFX9-NEXT: s_sub_i32 s11, 64, s10 ; GFX9-NEXT: s_cmp_lt_u32 s10, 64 @@ -6190,10 +6187,10 @@ ; GFX10-NEXT: s_lshr_b32 s24, s1, 31 ; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX10-NEXT: s_mov_b32 s25, s19 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_mov_b32 s25, s19 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[24:25], s[2:3] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] ; GFX10-NEXT: s_sub_i32 s23, s16, 64 ; GFX10-NEXT: s_sub_i32 s17, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 @@ -6234,7 +6231,7 @@ ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[18:19], s[6:7] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] ; GFX10-NEXT: s_sub_i32 s9, s10, 64 ; GFX10-NEXT: s_sub_i32 s11, 64, s10 ; GFX10-NEXT: s_cmp_lt_u32 s10, 64 @@ -6285,7 +6282,7 @@ ; GFX6-NEXT: v_and_b32_e32 v23, s6, v17 ; GFX6-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 @@ -6328,7 +6325,7 @@ ; GFX6-NEXT: v_and_b32_e32 v17, s6, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v17 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v17 @@ -6377,7 +6374,7 @@ ; GFX8-NEXT: v_and_b32_e32 v23, s6, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] @@ -6420,7 +6417,7 @@ ; GFX8-NEXT: v_and_b32_e32 v17, s6, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v17 ; GFX8-NEXT: 
v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] @@ -6469,7 +6466,7 @@ ; GFX9-NEXT: v_and_b32_e32 v23, s6, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 ; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] @@ -6512,7 +6509,7 @@ ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, 64, v17 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[6:7] @@ -6567,7 +6564,7 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25 -; GFX10-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX10-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 ; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 @@ -6600,7 +6597,7 @@ ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 -; GFX10-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25 ; GFX10-NEXT: v_and_b32_e32 v23, s5, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -4054,4 +4054,47 @@ EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +// Test narrowScalar of G_SHL with a constant shift amount +TEST_F(AArch64GISelMITest, narrowScalarShiftByConstant) { + setUp(); + if (!TM) + return; + + DefineLegalizerInfo(A, {}); + + LLT S64{LLT::scalar(64)}; + LLT S32{LLT::scalar(32)}; + + auto Constant = B.buildConstant(S64, 33); + auto Trunc = B.buildTrunc(S32, Constant); + auto Shift = B.buildShl(S64, Copies[0], Trunc); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + + // Perform Legalization + B.setInsertPt(*EntryMBB, Shift->getIterator()); + + // This should detect the G_CONSTANT feeding the G_SHL through a G_TRUNC + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.narrowScalarShift(*Shift, 0, S32)); + + const auto *CheckStr = R"( + CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY + CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY + CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY + CHECK: [[THIRTY3:%[0-9]+]]:_(s64) = G_CONSTANT i64 33 + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %4:_(s64) + CHECK: [[UNMERGE:%[0-9]+]]:_(s32), [[UNMERGE2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY0]] + CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + CHECK: [[ONE:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + CHECK: [[SHIFT:%[0-9]+]]:_(s32) = G_SHL [[UNMERGE]]:_, [[ONE]]:_(s32) + CHECK: [[MERGE:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ZERO]]:_(s32), [[SHIFT]]:_(s32) + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace
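The unit test above builds the chain G_TRUNC(G_CONSTANT i64 33) as the shift amount and expects the 64-bit G_SHL to still be narrowed, since the amount is a known constant once the truncate is looked through. The following standalone sketch illustrates that look-through idea only; the MIDef struct and findConstantShiftAmount are hypothetical names for illustration, not GlobalISel API.

#include <cstdint>
#include <optional>

enum class Opcode { G_CONSTANT, G_TRUNC, Other };

struct MIDef {
  Opcode Op;
  std::int64_t Imm = 0;       // meaningful only when Op == G_CONSTANT
  const MIDef *Src = nullptr; // meaningful only when Op == G_TRUNC
};

// Walk through any chain of truncates; return the constant if one is found.
std::optional<std::int64_t> findConstantShiftAmount(const MIDef *Def) {
  while (Def && Def->Op == Opcode::G_TRUNC)
    Def = Def->Src; // look through the truncate to its source
  if (Def && Def->Op == Opcode::G_CONSTANT)
    return Def->Imm;
  return std::nullopt; // unknown amount; fall back to the generic expansion
}

int main() {
  MIDef C{Opcode::G_CONSTANT, 33};
  MIDef T{Opcode::G_TRUNC, 0, &C};
  // Mirrors the G_TRUNC(G_CONSTANT i64 33) chain built in the unit test.
  return findConstantShiftAmount(&T).value_or(-1) == 33 ? 0 : 1;
}

Because 33 = 32 + 1, the narrowed result is assembled from two 32-bit halves: the low half becomes the constant 0 and the high half is the old low half shifted left by 1, which is exactly the G_UNMERGE_VALUES / G_SHL / G_MERGE_VALUES sequence the CHECK lines in the test assert.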