Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1833,13 +1833,13 @@
   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
-  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
-  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
-  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
-  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
-  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
-  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
-  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
+  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
+  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
+  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
+  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
+  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
+  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1870,7 +1870,7 @@
 
 def : Pat <
   (fneg (fabs f32:$src)),
-  (S_OR_B32 $src, 0x80000000) // Set sign bit
+  (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit
 >;
 
 // FIXME: Should use S_OR_B32
Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -144,12 +144,6 @@
       TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
     return;
 
-  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
-  // SGPR, we cannot commute the instruction, so we can't fold any literal
-  // constants.
-  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
-    return;
-
   // Try to fold Src0
   if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
     unsigned Reg = Src0.getReg();
@@ -158,7 +152,8 @@
       MachineOperand &MovSrc = Def->getOperand(1);
       bool ConstantFolded = false;
 
-      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
+      if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+                             isUInt<32>(MovSrc.getImm()))) {
         Src0.ChangeToImmediate(MovSrc.getImm());
         ConstantFolded = true;
       }
Index: test/CodeGen/AMDGPU/ctlz.ll
===================================================================
--- test/CodeGen/AMDGPU/ctlz.ll
+++ test/CodeGen/AMDGPU/ctlz.ll
@@ -143,7 +143,7 @@
 ; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
 ; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
 ; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
-; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
+; SI-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
 ; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]]
 ; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc
 ; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
Index: test/CodeGen/AMDGPU/fneg-fabs.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-fabs.ll
+++ test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -82,8 +82,10 @@
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: -PV
 
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; FIXME: In this case two uses of the constant should be folded
+; SI: s_mov_b32 [[SIGNBITK:s[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
@@ -92,10 +94,11 @@
 }
 
 ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+; SI: s_mov_b32 [[SIGNBITK:s[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -16,7 +16,7 @@
 ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
+; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: s_endpgm
 define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
@@ -440,7 +440,7 @@
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]]
+; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
 ; GCN-DAG: buffer_store_dword [[PACKED]]
 ; GCN: s_endpgm
 define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -10,9 +10,7 @@
 ; VI: s_load_dword [[SRC:s[0-9]+]]
 ; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]]
 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
-; TODO: this constant should be folded:
-; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff
-; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[K]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
 ; VI: buffer_store_dword [[RESULT]]
 define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
Index: test/CodeGen/AMDGPU/or.ll
===================================================================
--- test/CodeGen/AMDGPU/or.ll
+++ test/CodeGen/AMDGPU/or.ll
@@ -113,11 +113,9 @@
 }
 
 ; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
-; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
 ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
 define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
Index: test/CodeGen/AMDGPU/s_movk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_movk_i32.ll
+++ test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_movk_i32_k0:
@@ -11,6 +11,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295032831)
   ret void
 }
 
@@ -24,6 +25,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295000063)
   ret void
 }
 
@@ -37,6 +39,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 274877939711)
   ret void
 }
 
@@ -50,6 +53,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295000064)
   ret void
 }
 
@@ -63,6 +67,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 4295098368)
   ret void
 }
 
@@ -77,6 +82,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 18374967954648334319)
   ret void
 }
 
@@ -90,6 +96,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 270582939713)
   ret void
 }
 
@@ -104,10 +111,10 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 70368744185856)
   ret void
 }
 
-
 ; SI-LABEL: {{^}}s_movk_i32_k8:
 ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
 ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
@@ -119,6 +126,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255906816)
   ret void
 }
 
@@ -133,6 +141,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255906817)
   ret void
 }
 
@@ -147,6 +156,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255909000)
   ret void
 }
 
@@ -161,6 +171,7 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255910911)
   ret void
 }
 
@@ -175,5 +186,6 @@
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
   store i64 %or, i64 addrspace(1)* %out
+  call void asm sideeffect "; use $0", "s"(i64 1229782942255902721)
   ret void
 }
Index: test/CodeGen/AMDGPU/si-literal-folding.ll
===================================================================
--- test/CodeGen/AMDGPU/si-literal-folding.ll
+++ test/CodeGen/AMDGPU/si-literal-folding.ll
@@ -1,9 +1,8 @@
-; XFAIL: *
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}main:
-; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; GCN-LABEL: {{^}}main:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
 
 define amdgpu_vs void @main(float) {
 main_body:
   %1 = fmul float %0, 0x3FE86A7F00000000