Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -62,6 +62,10 @@ addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + // TODO: Subtarget feature for i16 + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + computeRegisterProperties(STI.getRegisterInfo()); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); @@ -254,6 +258,64 @@ setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + setOperationAction(ISD::Constant, MVT::i16, Legal); + + setOperationAction(ISD::ADD, MVT::i16, Legal); + setOperationAction(ISD::SUB, MVT::i16, Legal); + setOperationAction(ISD::SHL, MVT::i16, Legal); + setOperationAction(ISD::SRL, MVT::i16, Legal); + setOperationAction(ISD::SRA, MVT::i16, Legal); + + setOperationAction(ISD::SMIN, MVT::i16, Legal); + setOperationAction(ISD::SMAX, MVT::i16, Legal); + setOperationAction(ISD::UMIN, MVT::i16, Legal); + setOperationAction(ISD::UMAX, MVT::i16, Legal); + + setOperationAction(ISD::SETCC, MVT::i16, Legal); + setOperationAction(ISD::TRUNCATE, MVT::i16, Legal); + + setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); + AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); + + setOperationAction(ISD::AND, MVT::i16, Promote); + setOperationAction(ISD::OR, MVT::i16, Promote); + setOperationAction(ISD::XOR, MVT::i16, Promote); + + setOperationAction(ISD::ROTR, MVT::i16, Promote); + setOperationAction(ISD::ROTL, MVT::i16, Promote); + + setOperationAction(ISD::SDIV, MVT::i16, Promote); + setOperationAction(ISD::UDIV, MVT::i16, Promote); + setOperationAction(ISD::SREM, MVT::i16, Promote); + setOperationAction(ISD::UREM, MVT::i16, Promote); + setOperationAction(ISD::MUL, MVT::i16, Promote); + + setOperationAction(ISD::BSWAP, MVT::i16, Promote); + setOperationAction(ISD::CTTZ, MVT::i16, Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTLZ, MVT::i16, Promote); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + + setOperationAction(ISD::SELECT, MVT::i16, Legal); + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + + setOperationAction(ISD::BR_CC, MVT::i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i16, Custom); + setOperationAction(ISD::STORE, MVT::i16, Custom); + + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i16, Legal); + + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i16, Expand); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + } + setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -1678,6 +1740,21 @@ ISD::LoadExtType ExtType = Load->getExtensionType(); EVT MemVT = Load->getMemoryVT(); + if (MemVT == MVT::i16) { + assert(Load->getValueType(0) == MVT::i16); + + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Load->getChain(), + Load->getBasePtr(), MVT::i16, + Load->getMemOperand()); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtLoad), + 
ExtLoad.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC @@ -1942,6 +2019,16 @@ StoreSDNode *Store = cast(Op); EVT VT = Store->getMemoryVT(); + if (VT == MVT::i16) { + SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Store->getValue()); + + return DAG.getTruncStore(Store->getChain(), DL, + Ext, + Store->getBasePtr(), + MVT::i16, + Store->getMemOperand()); + } + if (VT == MVT::i1) { return DAG.getTruncStore(Store->getChain(), DL, DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -89,7 +89,7 @@ // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add (sequence "SGPR%u", 0, 103))>; // SGPR 64-bit registers @@ -136,7 +136,8 @@ (add (decimate (shl SGPR_32, 15), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +// i16 only on VI+ +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add (sequence "VGPR%u", 0, 255))>; // VGPR 64-bit registers @@ -198,7 +199,7 @@ } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; @@ -287,7 +288,7 @@ // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)>; def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { let CopyCost = 2; Index: lib/Target/AMDGPU/VIInstructions.td =================================================================== --- lib/Target/AMDGPU/VIInstructions.td +++ lib/Target/AMDGPU/VIInstructions.td @@ -116,8 +116,111 @@ def : Pat < (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$bound_ctrl, imm:$bank_mask, imm:$row_mask), - (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i1imm $bound_ctrl), - (as_i32imm $bank_mask), (as_i32imm $row_mask)) + (i32 (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i1imm $bound_ctrl), + (as_i32imm $bank_mask), (as_i32imm $row_mask))) >; +//===----------------------------------------------------------------------===// +// i16 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i16 imm:$imm), + (S_MOV_B32 imm:$imm) +>; + +def : Pat< + (i32 (anyext i16:$src)), + (COPY $src) +>; + +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE patterns don't support instructions with multiple +// outputs. 
+def : Pat< + (i64 (zext i16:$src)), + (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (i32 0xffff)), SGPR_32)), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + +def : Pat < + (i64 (sext i16:$src)), + (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (i32 31)), SGPR_32)), sub1) +>; + +// Same as a 32-bit inreg +def : Pat< + (i32 (sext i16:$src)), + (S_SEXT_I32_I16 $src) +>; + +def : Pat< + (i16 (trunc i32:$src)), + (COPY $src) +>; + +class ZExt_i16_i1_Pat : Pat < + (i16 (ext i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) +>; + +def : ZExt_i16_i1_Pat; +def : ZExt_i16_i1_Pat; + +def : Pat < + (i16 (select i1:$src0, i16:$src1, i16:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +// Note: 16-bit instructions produce a 0 result in the high 16-bits. +multiclass Arithmetic_i16_Pats { + +def : Pat< + (op i16:$src0, i16:$src1), + (inst i16:$src0, i16:$src1) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i16:$src1))), + (inst i16:$src0, i16:$src1) +>; + +def : Pat< + (i64 (zext (op i16:$src0, i16:$src1))), + (REG_SEQUENCE VReg_64, + (inst i16:$src0, i16:$src1), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; +} + +multiclass Bits_Ops_i16_Pats { + +def : Pat< + (op i16:$src0, i32:$src1), + (inst i16:$src0, i32:$src1) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i32:$src1))), + (inst i16:$src0, i32:$src1) +>; + +def : Pat< + (i64 (zext (op i16:$src0, i32:$src1))), + (REG_SEQUENCE VReg_64, + (inst i16:$src0, i32:$src1), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; +} + +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; + +defm : Bits_Ops_i16_Pats; +defm : Bits_Ops_i16_Pats; +defm : Bits_Ops_i16_Pats; } // End Predicates = [isVI] Index: test/CodeGen/AMDGPU/add.i16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/add.i16.ll @@ -0,0 +1,233 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}v_test_add_i16: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_i16_constant: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %add = add i16 %a, 123 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_i16_neg_constant: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_add_u16_e32 
[[ADD:v[0-9]+]], 0xfffffcb3, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %add = add i16 %a, -845 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_i16_inline_neg1: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %add = add i16 %a, -1 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: buffer_store_dword [[ADD]] +define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + %ext = zext i16 %add to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}} +; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}} +define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + %ext = zext i16 %add to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 +; VI-NEXT: buffer_store_dword [[SEXT]] +define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load i16, i16 addrspace(1)* %gep.in0 + %b = load i16, i16 addrspace(1)* %gep.in1 + %add 
= add i16 %a, %b
+  %ext = sext i16 %add to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
+; VI: flat_load_ushort [[A:v[0-9]+]]
+; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep.in0
+  %b = load i16, i16 addrspace(1)* %gep.in1
+  %add = add i16 %a, %b
+  %ext = sext i16 %add to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_add_u16_e32 [[RESULT:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_short [[RESULT]]
+define void @s_test_add_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #1 {
+  %add = add i16 %a, %b
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16_zeroext_args:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_add_u16_e32 [[RESULT:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_short [[RESULT]]
+define void @s_test_add_i16_zeroext_args(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #1 {
+  %add = add i16 %a, %b
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16_signext_args:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_add_u16_e32 [[RESULT:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_short [[RESULT]]
+define void @s_test_add_i16_signext_args(i16 addrspace(1)* %out, i16 signext %a, i16 signext %b) #1 {
+  %add = add i16 %a, %b
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16_zext_to_i32:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_dword [[ADD]]
+define void @s_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #1 {
+  %add = add i16 %a, %b
+  %ext = zext i16 %add to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16_zext_to_i64:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI-DAG: v_add_u16_e32 v[[LO:[0-9]+]], [[B]], [[VA]]
+; VI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @s_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #1 {
+  %add = add i16 %a, %b
+  %ext = zext i16 %add to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16_sext_to_i32:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: v_bfe_i32 [[RESULT:v[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: buffer_store_dword [[RESULT]]
+define void @s_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #1 {
+  %add = add i16 %a, %b
+  %ext = sext i16 %add to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_i16_sext_to_i64:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @s_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 signext %a, i16 signext %b) #1 {
+  %add = add i16 %a, %b
+  %ext = sext i16 %add to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/anyext.ll
===================================================================
--- test/CodeGen/AMDGPU/anyext.ll
+++ test/CodeGen/AMDGPU/anyext.ll
@@ -1,15 +1,31 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-; CHECK-LABEL: {{^}}anyext_i1_i32:
-; CHECK: v_cndmask_b32_e64
+; GCN-LABEL: {{^}}anyext_i1_i32:
+; GCN: v_cndmask_b32_e64
 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
 entry:
-  %0 = icmp eq i32 %cond, 0
-  %1 = zext i1 %0 to i8
-  %2 = xor i8 %1, -1
-  %3 = and i8 %2, 1
-  %4 = zext i8 %3 to i32
-  store i32 %4, i32 addrspace(1)* %out
+  %tmp = icmp eq i32 %cond, 0
+  %tmp1 = zext i1 %tmp to i8
+  %tmp2 = xor i8 %tmp1, -1
+  %tmp3 = and i8 %tmp2, 1
+  %tmp4 = zext i8 %tmp3 to i32
+  store i32 %tmp4, i32 addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}s_anyext_i16_i32:
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]],
+; VI: v_not_b32_e32 [[NOT:v[0-9]+]], [[ADD]]
+; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[NOT]]
+; VI: buffer_store_dword [[AND]]
+define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+entry:
+  %tmp = add i16 %a, %b
+  %tmp1 = trunc i16 %tmp to i8
+  %tmp2 = xor i8 %tmp1, -1
+  %tmp3 = and i8 %tmp2, 1
+  %tmp4 = zext i8 %tmp3 to i32
+  store i32 %tmp4, i32 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/global-extload-i8.ll
===================================================================
--- test/CodeGen/AMDGPU/global-extload-i8.ll
+++ test/CodeGen/AMDGPU/global-extload-i8.ll
@@ -150,6 +150,118 @@
 ; ret void
 ; }
 
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i16:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+  %a = load i8, i8 addrspace(1)* %in
+  %ext = zext i8 %a to i16
+  store i16 %ext, i16 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i16:
+; SI: buffer_load_sbyte
+; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i16 + store i16 %ext, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i16: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i16> + store <1 x i16> %ext, <1 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i16: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i16> + store <1 x i16> %ext, <1 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i16: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i16> + store <2 x i16> %ext, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i16: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i16> + store <2 x i16> %ext, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i16: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i16> + store <4 x i16> %ext, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i16: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i16> + store <4 x i16> %ext, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i16: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i16> + store <8 x i16> %ext, <8 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i16: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i16> + store <8 x i16> %ext, <8 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i16: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i16> + store <16 x i16> %ext, <16 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i16: +; SI: s_endpgm +define void 
@sextload_global_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+  %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+  %ext = sext <16 x i8> %load to <16 x i16>
+  store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
 ; SI: buffer_load_ubyte v[[LO:[0-9]+]],
 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
Index: test/CodeGen/AMDGPU/max.i16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/max.i16.ll
@@ -0,0 +1,168 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}v_test_imax_sge_i16:
+; VI: v_max_i16_e32
+define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0, align 4
+  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %cmp = icmp sge i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_imax_sge_v4i16:
+; VI: v_max_i16_e32
+; VI: v_max_i16_e32
+; VI: v_max_i16_e32
+; VI: v_max_i16_e32
+define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep0, align 4
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %gep1, align 4
+  %cmp = icmp sge <4 x i16> %a, %b
+  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
+  store <4 x i16> %val, <4 x i16> addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_imax_sge_i16
+; VI: s_max_i16
+define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+  %cmp = icmp sge i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i16:
+; VI: s_max_i16 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sge_imm_i16(i16 addrspace(1)* %out, i16 %a) nounwind {
+  %cmp = icmp sge i16 %a, 9
+  %val = select i1 %cmp, i16 %a, i16 9
+  store i16 %val, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i16:
+; VI: s_max_i16 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sgt_imm_i16(i16 addrspace(1)* %out, i16 %a) nounwind {
+  %cmp = icmp sgt i16 %a, 9
+  %val = select i1 %cmp, i16 %a, i16 9
+  store i16 %val, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_v2i16:
+; VI: s_max_i16 {{s[0-9]+}}, {{s[0-9]+}}, 9
+; VI: s_max_i16 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sgt_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a) nounwind {
+  %cmp = icmp sgt <2 x i16> %a, <i16 9, i16 9>
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> <i16 9, i16 9>
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_imax_sgt_i16
+; VI: v_max_i16_e32
+define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0, align 4
+  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %cmp = icmp sgt i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_imax_sgt_i16
+; VI: s_max_i16
+define void @s_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+  %cmp = icmp sgt i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umax_uge_i16
+; VI: v_max_u32_e32
+define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0, align 4
+  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %cmp = icmp uge i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_umax_uge_i16
+; VI: s_max_u32
+define void @s_test_umax_uge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+  %cmp = icmp uge i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umax_uge_v3i16:
+; VI: s_max_u32
+; VI: s_max_u32
+; VI: s_max_u32
+; VI-NOT: s_max_u32
+; VI: s_endpgm
+define void @s_test_umax_uge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) nounwind {
+  %cmp = icmp uge <3 x i16> %a, %b
+  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
+  store <3 x i16> %val, <3 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umax_ugt_i16
+; VI: v_max_u32_e32
+define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep0, align 4
+  %b = load i16, i16 addrspace(1)* %gep1, align 4
+  %cmp = icmp ugt i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umax_ugt_i16:
+; VI: s_max_u32
+define void @s_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+  %cmp = icmp ugt i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i16:
+; VI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
+; VI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
+define void @s_test_umax_ugt_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a) nounwind {
+  %cmp = icmp ugt <2 x i16> %a, <i16 15, i16 23>
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> <i16 15, i16 23>
+  store <2
x i16> %val, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + Index: test/CodeGen/AMDGPU/min_test.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/min_test.ll @@ -0,0 +1,188 @@ +; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + +; FUNC-LABEL: {{^}}v_test_imin_sle_i16: +; VI: v_min_i16_e32 +define void @v_test_imin_sle_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp sle i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_i16: +; VI: s_min_i16 +define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sle i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_v1i16: +; VI: s_min_i16 +define void @s_test_imin_sle_v1i16(<1 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { + %cmp = icmp sle <1 x i16> %a, %b + %val = select <1 x i1> %cmp, <1 x i16> %a, <1 x i16> %b + store <1 x i16> %val, <1 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: +; VI: v_min_i16 +; VI: v_min_i16 +; VI: v_min_i16 +; VI: v_min_i16 +define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %cmp = icmp sle <4 x i16> %a, %b + %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %val, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @v_test_imin_slt_i16 +; VI: v_min_i16_e32 +define void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp slt i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imin_slt_i16 +; VI: s_min_i16 +define void @s_test_imin_slt_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp slt i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_slt_v2i16: +; VI: s_min_i16 +; VI: s_min_i16 +define void @s_test_imin_slt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { + %cmp = icmp slt <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i16: +; VI: s_min_i16 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_slt_imm_i16(i16 addrspace(1)* %out, i16 %a) nounwind { + %cmp = icmp slt i16 %a, 8 + %val = select i1 %cmp, 
i16 %a, i16 8 + store i16 %val, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i16: +; VI: s_min_i16 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_sle_imm_i16(i16 addrspace(1)* %out, i16 %a) nounwind { + %cmp = icmp sle i16 %a, 8 + %val = select i1 %cmp, i16 %a, i16 8 + store i16 %val, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ule_i16 +; VI: v_min_u16_e32 +define void @v_test_umin_ule_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp ule i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ule_v3i16 +; VI: v_min_u16_e32 +; VI: v_min_u16_e32 +; VI: v_min_u16_e32 +; VI-NOT: v_min_u16_e32 +; VI: s_endpgm +define void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) nounwind { + %cmp = icmp ule <3 x i16> %a, %b + %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %val, <3 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ule_i16 +; VI: s_min_u16 +define void @s_test_umin_ule_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp ule i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i16 +; VI: v_min_u16_e32 +define void @v_test_umin_ult_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ult_i16 +; VI: s_min_u16 +define void @s_test_umin_ult_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp ult i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ult_v1i16 +; VI: s_min_u16 +define void @s_test_umin_ult_v1i16(<1 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { + %cmp = icmp ult <1 x i16> %a, %b + %val = select <1 x i1> %cmp, <1 x i16> %a, <1 x i16> %b + store <1 x i16> %val, <1 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: +; VI: s_min_u16 +; VI: s_min_u16 +; VI: s_min_u16 +; VI: s_min_u16 +; VI: s_min_u16 +; VI: s_min_u16 +; VI: s_min_u16 +; VI: s_min_u16 +define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %cmp = icmp ult <8 x i16> %a, %b + %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + store <8 x i16> %val, <8 x i16> addrspace(1)* %out + ret void +} + + Index: test/CodeGen/AMDGPU/shl.i16.ll 
=================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/shl.i16.ll @@ -0,0 +1,75 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +;VI: {{^}}shl_v2i16: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 1 + %a = load <2 x i16>, <2 x i16> addrspace(1) * %in + %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr + %result = shl <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +;VI: {{^}}shl_v4i16: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 1 + %a = load <4 x i16>, <4 x i16> addrspace(1) * %in + %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr + %result = shl <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + + +;VI: {{^}}shl_i16: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %a = load i16, i16 addrspace(1) * %in + %b = load i16, i16 addrspace(1) * %b_ptr + %result = shl i16 %a, %b + store i16 %result, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_shl_i16_32_bit_constant: +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}} +; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}} +; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] +define void @v_shl_i16_32_bit_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) { + %a = load i16, i16 addrspace(1)* %aptr, align 8 + %shl = shl i16 1234567, %a + store i16 %shl, i16 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_shl_inline_imm_8_i16: +; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}} +define void @v_shl_inline_imm_64_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) { + %a = load i16, i16 addrspace(1)* %aptr, align 8 + %shl = shl i16 8, %a + store i16 %shl, i16 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i16: +; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}} +define void @s_shl_inline_imm_1_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 %a) { + %shl = shl i16 1, %a + store i16 %shl, i16 addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/shl.ll =================================================================== --- test/CodeGen/AMDGPU/shl.ll +++ test/CodeGen/AMDGPU/shl.ll @@ -53,6 +53,81 @@ ret void } +;EG-LABEL: {{^}}shl_i16: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG-DAG: LSHL {{\*? 
*}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]|PS}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]|PV.[XYZW]}} +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 + +;SI: {{^}}shl_i16: +;SI: v_lshl_b16 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_i16: +;VI: v_lshlrev_b16 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %a = load i16, i16 addrspace(1) * %in + %b = load i16, i16 addrspace(1) * %b_ptr + %result = shl i16 %a, %b + store i16 %result, i16 addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v2i16: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v2i16: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v2i16: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 + %a = load <2 x i16>, <2 x i16> addrspace(1) * %in + %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr + %result = shl <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i16: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v4i16: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v4i16: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 + %a = load <4 x i16>, <4 x i16> addrspace(1) * %in + %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr + %result = shl <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + ;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? 
*}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} Index: test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- test/CodeGen/AMDGPU/sign_extend.ll +++ test/CodeGen/AMDGPU/sign_extend.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}s_sext_i1_to_i32: @@ -55,9 +55,19 @@ } ; SI-LABEL: {{^}}s_sext_i16_to_i64: -; SI: s_endpgm +; SI: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void } + +; SI-LABEL: {{^}}s_sext_i1_to_i16: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 +; SI-NEXT: buffer_store_short [[RESULT]] +define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i16 + store i16 %sext, i16 addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/sra.i16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sra.i16.ll @@ -0,0 +1,90 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; FUNC-LABEL: {{^}}ashr_v2i16: + +; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = ashr <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ashr_v4i16: + +; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = ashr <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}ashr_i16_2: + +; VI: v_ashrrev_i16 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_i16_2(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %a = load i16, i16 addrspace(1)* %in + %b = load i16, i16 addrspace(1)* %b_ptr + %result = ashr i16 %a, %b + store i16 %result, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_ashr_2_i16: +; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; VI: flat_load_dword v[[HI:[0-9]+]] +; GCN: v_ashrrev_i16_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] +; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}} +define void 
@v_ashr_2_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep.in
+  %result = ashr i16 %a, 2
+  store i16 %result, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_ashr_4_i16:
+; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i16 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
+; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
+; GCN: s_addc_u32 {{s[0-9]+}}, s[[COPYSHIFT]], {{s[0-9]+}}
+define void @s_ashr_4_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) {
+  %result = ashr i16 %a, 4
+  %add = add i16 %result, %b
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_8_i16:
+; VI: flat_load_dword v[[HI:[0-9]+]]
+define void @v_ashr_8_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+  %a = load i16, i16 addrspace(1)* %gep.in
+  %result = ashr i16 %a, 8
+  store i16 %result, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/sra.ll
===================================================================
--- test/CodeGen/AMDGPU/sra.ll
+++ test/CodeGen/AMDGPU/sra.ll
@@ -46,6 +46,48 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}ashr_v2i16:
+; SI: v_ashr_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+  %result = ashr <2 x i16> %a, %b
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ashr_v4i16:
+; SI: v_ashr_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_ashr_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ASHR {{\*?
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = ashr <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_ashr_i64: ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 Index: test/CodeGen/AMDGPU/sub.i16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sub.i16.ll @@ -0,0 +1,233 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}v_test_sub_i16: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %sub = sub i16 %a, %b + store i16 %sub, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_constant: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %sub = sub i16 %a, 123 + store i16 %sub, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_neg_constant: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %sub = sub i16 %a, -845 + store i16 %sub, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_inline_neg1: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_sub_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %sub = sub i16 %a, -1 + store i16 %sub, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; 
VI-NEXT: buffer_store_dword [[ADD]] +define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %sub = sub i16 %a, %b + %ext = zext i16 %sub to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}} +; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}} +define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %sub = sub i16 %a, %b + %ext = zext i16 %sub to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 +; VI-NEXT: buffer_store_dword [[SEXT]] +define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load i16, i16 addrspace(1)* %gep.in0 + %b = load i16, i16 addrspace(1)* %gep.in1 + %sub = sub i16 %a, %b + %ext = sext i16 %sub to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 +; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load i16, i16 addrspace(1)* %gep.in0 + %b = load i16, i16 addrspace(1)* %gep.in1 + %sub = sub i16 %a, %b + %ext = sext i16 %sub to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_test_sub_i16: +; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c +; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30 +; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] +; VI: v_sub_u16_e32 [[RESULT:v[0-9]+]], [[B]], 
+; VI-NEXT: buffer_store_short [[RESULT]]
+define void @s_test_sub_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #1 {
+  %sub = sub i16 %a, %b
+  store i16 %sub, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_i16_zeroext_args:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_sub_u16_e32 [[RESULT:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_short [[RESULT]]
+define void @s_test_sub_i16_zeroext_args(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #1 {
+  %sub = sub i16 %a, %b
+  store i16 %sub, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_i16_signext_args:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_sub_u16_e32 [[RESULT:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_short [[RESULT]]
+define void @s_test_sub_i16_signext_args(i16 addrspace(1)* %out, i16 signext %a, i16 signext %b) #1 {
+  %sub = sub i16 %a, %b
+  store i16 %sub, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_i16_zext_to_i32:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: buffer_store_dword [[ADD]]
+define void @s_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #1 {
+  %sub = sub i16 %a, %b
+  %ext = zext i16 %sub to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_i16_zext_to_i64:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI-DAG: v_sub_u16_e32 v[[LO:[0-9]+]], [[B]], [[VA]]
+; VI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @s_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #1 {
+  %sub = sub i16 %a, %b
+  %ext = zext i16 %sub to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_i16_sext_to_i32:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: v_bfe_i32 [[RESULT:v[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: buffer_store_dword [[RESULT]]
+define void @s_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #1 {
+  %sub = sub i16 %a, %b
+  %ext = sext i16 %sub to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_i16_sext_to_i64:
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s[0:1], 0x30
+; VI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[VA]]
+; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
+; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @s_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 signext %a, i16 signext %b) #1 {
+  %sub = sub i16 %a, %b
+  %ext = sext i16 %sub to i64
+  store i64 %ext, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/sub.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.ll
+++ test/CodeGen/AMDGPU/sub.ll
@@ -54,6 +54,52 @@
  ret void
}

+; SI: v_subrev_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+  %a = load i16, i16 addrspace(1)* %in
+  %b = load i16, i16 addrspace(1)* %b_ptr
+  %result = sub i16 %a, %b
+  store i16 %result, i16 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v2i16:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_sub_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+
+define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
+  %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
+  %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
+  %result = sub <2 x i16> %a, %b
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v4i16:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_sub_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i16_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+
+define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
+  %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
+  %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
+  %result = sub <4 x i16> %a, %b
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
; FUNC-LABEL: {{^}}s_sub_i64:
; SI: s_sub_u32
; SI: s_subb_u32
Index: test/CodeGen/AMDGPU/trunc-store-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -21,13 +21,20 @@
  ret void
}

-; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
; SI: s_load_dword [[LOAD:s[0-9]+]],
; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
; SI: buffer_store_byte [[VREG]],
-define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
  %trunc = trunc i16 %val to i1
  store i1 %trunc, i1 addrspace(1)* %out, align 1
  ret void
}
+
+; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
+  %add = add i16 %val0, %val1
+  %trunc = trunc i16 %add to i1
+  store i1 %trunc, i1 addrspace(1)* %out, align 1
+  ret void
+}
Index: test/CodeGen/AMDGPU/zero_extend.ll
===================================================================
--- test/CodeGen/AMDGPU/zero_extend.ll
+++ test/CodeGen/AMDGPU/zero_extend.ll
@@ -2,39 +2,58 @@
; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI

-; R600: {{^}}test:
+; R600: {{^}}s_mad_zext_i32_to_i64:
; R600: MEM_RAT_CACHELESS STORE_RAW
; R600: MEM_RAT_CACHELESS STORE_RAW

-; SI: {{^}}test:
+; SI: {{^}}s_mad_zext_i32_to_i64:
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
-define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
entry:
-  %0 = mul i32 %a, %b
-  %1 = add i32 %0, %c
-  %2 = zext i32 %1 to i64
-  store i64 %2, i64 addrspace(1)* %out
+  %tmp0 = mul i32 %a, %b
+  %tmp1 = add i32 %tmp0, %c
+  %tmp2 = zext i32 %tmp1 to i64
+  store i64 %tmp2, i64 addrspace(1)* %out
  ret void
}

-; SI-LABEL: {{^}}testi1toi32:
+; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32:
; SI: v_cndmask_b32
-define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
entry:
-  %0 = icmp eq i32 %a, %b
-  %1 = zext i1 %0 to i32
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp0 = icmp eq i32 %a, %b
+  %tmp1 = zext i1 %tmp0 to i32
+  store i32 %tmp1, i32 addrspace(1)* %out
  ret void
}

-; SI-LABEL: {{^}}zext_i1_to_i64:
+; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
+define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
+  %ext = zext i1 %arg to i64
+  store i64 %ext, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64:
; SI: s_mov_b32 s{{[0-9]+}}, 0
; SI: v_cmp_eq_i32
; SI: v_cndmask_b32
-define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %cmp = icmp eq i32 %a, %b
  %ext = zext i1 %cmp to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}
+
+; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; SI-NEXT: buffer_store_short [[RESULT]]
+define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+  %tmp0 = icmp eq i16 %a, %b
+  %tmp1 = zext i1 %tmp0 to i16
+  store i16 %tmp1, i16 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }