Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -917,15 +917,18 @@
   if (VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
-    if (Size == 32)
-      return ScalarVT.getSimpleVT();
+    if (Size == 16) {
+      if (Subtarget->has16BitInsts())
+        return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+      return VT.isInteger() ? MVT::i32 : MVT::f32;
+    }
 
-    if (Size > 32)
-      return MVT::i32;
+    if (Size < 16)
+      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
+    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
+  }
 
-    if (Size == 16 && Subtarget->has16BitInsts())
-      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
-  } else if (VT.getSizeInBits() > 32)
+  if (VT.getSizeInBits() > 32)
     return MVT::i32;
 
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -942,14 +945,15 @@
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
 
-    if (Size == 32)
+    // FIXME: Should probably promote 8-bit vectors to i16.
+    if (Size == 16 && Subtarget->has16BitInsts())
+      return (NumElts + 1) / 2;
+
+    if (Size <= 32)
       return NumElts;
 
     if (Size > 32)
       return NumElts * ((Size + 31) / 32);
-
-    if (Size == 16 && Subtarget->has16BitInsts())
-      return (NumElts + 1) / 2;
   } else if (VT.getSizeInBits() > 32)
     return (VT.getSizeInBits() + 31) / 32;
 
@@ -964,6 +968,16 @@
     unsigned NumElts = VT.getVectorNumElements();
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
+    // FIXME: We should fix the ABI to be the same on targets without 16-bit
+    // support, but unless we can properly handle 3-vectors, it will be still be
+    // inconsistent.
+    if (Size == 16 && Subtarget->has16BitInsts()) {
+      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+      IntermediateVT = RegisterVT;
+      NumIntermediates = (NumElts + 1) / 2;
+      return NumIntermediates;
+    }
+
     if (Size == 32) {
       RegisterVT = ScalarVT.getSimpleVT();
       IntermediateVT = RegisterVT;
@@ -971,20 +985,26 @@
       return NumIntermediates;
     }
 
-    if (Size > 32) {
+    if (Size < 16 && Subtarget->has16BitInsts()) {
+      // FIXME: Should probably form v2i16 pieces
+      RegisterVT = MVT::i16;
+      IntermediateVT = ScalarVT;
+      NumIntermediates = NumElts;
+      return NumIntermediates;
+    }
+
+
+    if (Size != 16 && Size <= 32) {
       RegisterVT = MVT::i32;
-      IntermediateVT = RegisterVT;
-      NumIntermediates = NumElts * ((Size + 31) / 32);
+      IntermediateVT = ScalarVT;
+      NumIntermediates = NumElts;
       return NumIntermediates;
     }
 
-    // FIXME: We should fix the ABI to be the same on targets without 16-bit
-    // support, but unless we can properly handle 3-vectors, it will be still be
-    // inconsistent.
-    if (Size == 16 && Subtarget->has16BitInsts()) {
-      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+    if (Size > 32) {
+      RegisterVT = MVT::i32;
       IntermediateVT = RegisterVT;
-      NumIntermediates = (NumElts + 1) / 2;
+      NumIntermediates = NumElts * ((Size + 31) / 32);
       return NumIntermediates;
     }
   }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -196,6 +196,89 @@
   ret half %val
 }
 
+define i24 @i24_func_void() #0 {
+  ; CHECK-LABEL: name: i24_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s24)
+  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  %val = load i24, i24 addrspace(1)* undef
+  ret i24 %val
+}
+
+define zeroext i24 @i24_zeroext_func_void() #0 {
+  ; CHECK-LABEL: name: i24_zeroext_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s24)
+  ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  %val = load i24, i24 addrspace(1)* undef
+  ret i24 %val
+}
+
+define signext i24 @i24_signext_func_void() #0 {
+  ; CHECK-LABEL: name: i24_signext_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s24)
+  ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  %val = load i24, i24 addrspace(1)* undef
+  ret i24 %val
+}
+
+define <2 x i24> @v2i24_func_void() #0 {
+  ; CHECK-LABEL: name: v2i24_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s24>) = G_LOAD [[DEF]](p1) :: (load 6 from `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK: [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<2 x s24>)
+  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
+  ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
+  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  %val = load <2 x i24>, <2 x i24> addrspace(1)* undef
+  ret <2 x i24> %val
+}
+
+define <3 x i24> @v3i24_func_void() #0 {
+  ; CHECK-LABEL: name: v3i24_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s24>) = G_LOAD [[DEF]](p1) :: (load 9 from `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
+  ; CHECK: [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24), [[UV2:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<3 x s24>)
+  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
+  ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
+  ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s24)
+  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
+  ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  %val = load <3 x i24>, <3 x i24> addrspace(1)* undef
+  ret <3 x i24> %val
+}
+
 define i32 @i32_func_void() #0 {
   ; CHECK-LABEL: name: i32_func_void
   ; CHECK: bb.1 (%ir-block.0):
@@ -977,6 +1060,44 @@
   ret <16 x i8> %val
 }
 
+define <2 x i8> @v2i8_func_void() #0 {
+  ; CHECK-LABEL: name: v2i8_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[DEF]](p1) :: (load 2 from `<2 x i8> addrspace(1)* undef`, addrspace 1)
+  ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<2 x s8>)
+  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8)
+  ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8)
+  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  %val = load <2 x i8>, <2 x i8> addrspace(1)* undef
+  ret <2 x i8> %val
+}
+
+define <3 x i8> @v3i8_func_void() #0 {
+  ; CHECK-LABEL: name: v3i8_func_void
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[DEF]](p1) :: (load 3 from `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<3 x s8>)
+  ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8)
+  ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8)
+  ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8)
+  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+  ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
+  ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  %val = load <3 x i8>, <3 x i8> addrspace(1)* undef
+  ret <3 x i8> %val
+}
+
 define <4 x i8> @v4i8_func_void() #0 {
   ; CHECK-LABEL: name: v4i8_func_void
   ; CHECK: bb.1 (%ir-block.0):
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -553,6 +553,104 @@
   ret void
 }
 
+define void @void_func_v2i24(<2 x i24> %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_v2i24
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s24>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: G_STORE [[TRUNC]](<2 x s24>), [[DEF]](p1) :: (store 6 into `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
+  ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK: S_SETPC_B64_return [[COPY3]]
+  store <2 x i24> %arg0, <2 x i24> addrspace(1)* undef
+  ret void
+}
+
+define void @void_func_v3i24(<3 x i24> %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_v3i24
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK: [[TRUNC:%[0-9]+]]:_(<3 x s24>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: G_STORE [[TRUNC]](<3 x s24>), [[DEF]](p1) :: (store 9 into `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
+  ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK: S_SETPC_B64_return [[COPY4]]
+  store <3 x i24> %arg0, <3 x i24> addrspace(1)* undef
+  ret void
+}
+
+define void @void_func_v2i8(<2 x i8> %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_v2i8
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+  ; CHECK: [[TRUNC2:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[BUILD_VECTOR]](<2 x s16>)
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: G_STORE [[TRUNC2]](<2 x s8>), [[DEF]](p1) :: (store 2 into `<2 x i8> addrspace(1)* undef`, addrspace 1)
+  ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
+  ; CHECK: S_SETPC_B64_return [[COPY3]]
+  store <2 x i8> %arg0, <2 x i8> addrspace(1)* undef
+  ret void
+}
+
+define void @void_func_v3i8(<3 x i8> %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_v3i8
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+  ; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16)
+  ; CHECK: [[TRUNC3:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s16>)
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: G_STORE [[TRUNC3]](<3 x s8>), [[DEF]](p1) :: (store 3 into `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
+  ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; CHECK: S_SETPC_B64_return [[COPY4]]
+  store <3 x i8> %arg0, <3 x i8> addrspace(1)* undef
+  ret void
+}
+
+define void @void_func_v4i8(<4 x i8> %arg0) #0 {
+  ; CHECK-LABEL: name: void_func_v4i8
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
+  ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+  ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+  ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+  ; CHECK: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR]](<4 x s16>)
+  ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: G_STORE [[TRUNC4]](<4 x s8>), [[DEF]](p1) :: (store 4 into `<4 x i8> addrspace(1)* undef`, addrspace 1)
+  ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
+  ; CHECK: S_SETPC_B64_return [[COPY5]]
+  store <4 x i8> %arg0, <4 x i8> addrspace(1)* undef
+  ret void
+}
+
 define void @void_func_v2p3i8(<2 x i8 addrspace(3)*> %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2p3i8
   ; CHECK: bb.1 (%ir-block.0):
Index: llvm/test/CodeGen/AMDGPU/call-return-types.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/call-return-types.ll
+++ llvm/test/CodeGen/AMDGPU/call-return-types.ll
@@ -30,6 +30,8 @@
 declare <5 x float> @external_v5f32_func_void() #0
 declare <2 x double> @external_v2f64_func_void() #0
 
+declare <2 x i24> @external_v2i24_func_void() #0
+
 declare <2 x i32> @external_v2i32_func_void() #0
 declare <3 x i32> @external_v3i32_func_void() #0
 declare <4 x i32> @external_v4i32_func_void() #0
@@ -250,6 +252,18 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_v2i24_func_void:
+; GCN: s_swappc_b64
+; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1
+define amdgpu_kernel void @test_call_external_v2i24_func_void() #0 {
+  %val = call <2 x i24> @external_v2i24_func_void()
+  %elt0 = extractelement <2 x i24> %val, i32 0
+  %elt1 = extractelement <2 x i24> %val, i32 1
+  %add = add i24 %elt0, %elt1
+  store volatile i24 %add, i24 addrspace(1)* undef
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_v3f32_func_void:
 ; GCN: s_swappc
 ; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2]
Index: llvm/test/CodeGen/AMDGPU/fshr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fshr.ll
+++ llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -981,127 +981,61 @@
 ; SI-LABEL: v_fshr_v2i24:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
-; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0
-; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0
-; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0
-; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_mul_hi_u32 v11, v2, s4
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_mul_hi_u32 v12, v3, s4
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT: v_lshrrev_b32_e32 v11, 4, v11
-; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12
-; SI-NEXT: v_mul_lo_u32 v11, v11, 24
-; SI-NEXT: v_mul_lo_u32 v12, v12, 24
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v12
-; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
+; SI-NEXT: v_mul_hi_u32 v6, v4, s4
+; SI-NEXT: v_mul_hi_u32 v7, v5, s4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
+; SI-NEXT: v_mul_lo_u32 v6, v6, 24
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
+; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
+; SI-NEXT: v_mul_lo_u32 v6, v6, 24
+; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
+; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
-; SI-NEXT: v_alignbit_b32 v1, v1, v6, v2
-; SI-NEXT: v_alignbit_b32 v2, v5, v4, v3
-; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen
-; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen
-; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen
-; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i24:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
-; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0
-; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
-; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0
-; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0
-; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_mul_hi_u32 v11, v2, s4
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_mul_hi_u32 v12, v3, s4
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; VI-NEXT: v_lshrrev_b32_e32 v11, 4, v11
-; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12
-; VI-NEXT: v_mul_lo_u32 v11, v11, 24
-; VI-NEXT: v_mul_lo_u32 v12, v12, 24
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v11
-; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v12
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; VI-NEXT: v_mul_hi_u32 v6, v4, s4
+; VI-NEXT: v_mul_hi_u32 v7, v5, s4
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
+; VI-NEXT: v_mul_lo_u32 v6, v6, 24
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
+; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
+; VI-NEXT: v_mul_lo_u32 v6, v6, 24
+; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
+; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6
 ; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
-; VI-NEXT: v_alignbit_b32 v1, v1, v6, v2
-; VI-NEXT: v_alignbit_b32 v2, v5, v4, v3
-; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen
-; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen
-; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen
-; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i24:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32
 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
-; GFX9-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NEXT: v_mul_hi_u32 v6, v1, s4
-; GFX9-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NEXT: v_mul_hi_u32 v7, v2, s4
-; GFX9-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4
+; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7
-; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
-; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
-; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
-; GFX9-NEXT: v_add_u32_e32 v1, 8, v1
-; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
-; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
-; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
-; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7
+; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6
+; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i24:
Index: llvm/test/CodeGen/AMDGPU/function-args.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/function-args.ll
+++ llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -344,6 +344,16 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}void_func_v2i24:
+; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1
+define void @void_func_v2i24(<2 x i24> %arg0) #0 {
+  %elt0 = extractelement <2 x i24> %arg0, i32 0
+  %elt1 = extractelement <2 x i24> %arg0, i32 1
+  %add = add i24 %elt0, %elt1
+  store i24 %add, i24 addrspace(1)* undef
+  ret void
+}
+
 ; GCN-LABEL: {{^}}void_func_v2f32:
 ; GCN-NOT: v[0:1]
 ; GCN-NOT: v0