Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -480,6 +480,8 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -583,19 +583,32 @@ bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. - return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); + + unsigned SrcSize = Source.getSizeInBits(); + unsigned DestSize = Dest.getSizeInBits(); + + return DestSize < SrcSize && DestSize % 32 == 0 ; } bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { // Truncate is just accessing a subregister. - return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && - (Dest->getPrimitiveSizeInBits() % 32 == 0); + + unsigned SrcSize = Source->getScalarSizeInBits(); + unsigned DestSize = Dest->getScalarSizeInBits(); + + if (DestSize== 16 && Subtarget->has16BitInsts()) + return SrcSize >= 32; + + return DestSize < SrcSize && DestSize % 32 == 0; } bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { unsigned SrcSize = Src->getScalarSizeInBits(); unsigned DestSize = Dest->getScalarSizeInBits(); + if (SrcSize == 16 && Subtarget->has16BitInsts()) + return DestSize >= 32; + return SrcSize == 32 && DestSize == 64; } @@ -604,6 +617,10 @@ // practical purposes, the extra mov 0 to load a 64-bit is free. As used, // this will enable reducing 64-bit operations the 32-bit, which is always // good. 
+ + if (Src == MVT::i16) + return Dest == MVT::i32 ||Dest == MVT::i64 ; + return Src == MVT::i32 && Dest == MVT::i64; } @@ -2347,6 +2364,10 @@ if (VT.isVector() || Size > 64) return SDValue(); + // There are i16 integer mul/mad. + if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -529,14 +529,14 @@ def : Pat < (fcopysign f32:$src0, f32:$src1), - (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1) >; def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) >; @@ -545,7 +545,7 @@ (f64 (fcopysign f64:$src0, f32:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), $src1), sub1) >; Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -710,13 +710,13 @@ // int_SI_vs_load_input def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0) >; // Offset in an 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0) >; @@ -916,7 +916,7 @@ >; -class 
MUBUFLoad_Pattern : Pat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), @@ -938,15 +938,34 @@ } let Predicates = [isSICI] in { -def : MUBUFLoad_Pattern ; -def : MUBUFLoad_Pattern ; -def : MUBUFLoad_Pattern ; -def : MUBUFLoad_Pattern ; +def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; +def : MUBUFLoad_PatternADDR64 ; defm : MUBUFLoad_Atomic_Pattern ; defm : MUBUFLoad_Atomic_Pattern ; } // End Predicates = [isSICI] +multiclass MUBUFLoad_Pattern { + + def : Pat < + (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [Has16BitInsts] in { + +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; + +} // End Predicates = [Has16BitInsts] + class MUBUFScratchLoadPat : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), @@ -955,6 +974,8 @@ def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; @@ -1027,6 +1048,20 @@ defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = [isSICI] + +multiclass MUBUFStore_Pattern { + + def : Pat < + (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +defm : MUBUFStore_Pattern ; +defm : MUBUFStore_Pattern ; + class MUBUFScratchStorePat : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), @@ -1035,6 +1070,8 @@ def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : 
MUBUFScratchStorePat ; Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -489,8 +489,12 @@ def : DSReadPat ; def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; +def : DSReadPat ; def : DSReadPat ; let AddedComplexity = 100 in { @@ -512,6 +516,8 @@ def : DSWritePat ; def : DSWritePat ; +def : DSWritePat ; +def : DSWritePat ; def : DSWritePat ; let AddedComplexity = 100 in { @@ -522,8 +528,8 @@ def : Pat < (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), - (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), + (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, (i1 0)) >; Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -341,6 +341,8 @@ def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -389,6 +391,10 @@ } // End Predicates = [isCIVI] +let Predicates = [isVI] in { + def : FlatStorePat ; + def : FlatStorePat ; +} //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -78,6 +78,9 @@ addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + if (Subtarget->has16BitInsts()) + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + computeRegisterProperties(STI.getRegisterInfo()); // We need to custom 
lower vector stores from local memory @@ -221,6 +224,55 @@ setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::Constant, MVT::i16, Legal); + + setOperationAction(ISD::SMIN, MVT::i16, Legal); + setOperationAction(ISD::SMAX, MVT::i16, Legal); + + setOperationAction(ISD::UMIN, MVT::i16, Legal); + setOperationAction(ISD::UMAX, MVT::i16, Legal); + + setOperationAction(ISD::SETCC, MVT::i16, Promote); + AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32); + + setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); + AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); + + setOperationAction(ISD::ROTR, MVT::i16, Promote); + setOperationAction(ISD::ROTL, MVT::i16, Promote); + + setOperationAction(ISD::SDIV, MVT::i16, Promote); + setOperationAction(ISD::UDIV, MVT::i16, Promote); + setOperationAction(ISD::SREM, MVT::i16, Promote); + setOperationAction(ISD::UREM, MVT::i16, Promote); + + setOperationAction(ISD::BSWAP, MVT::i16, Promote); + setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); + + setOperationAction(ISD::CTTZ, MVT::i16, Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTLZ, MVT::i16, Promote); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + + setOperationAction(ISD::BR_CC, MVT::i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i16, Custom); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); + 
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); + } + setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -2558,7 +2610,6 @@ EVT MemVT = Load->getMemoryVT(); if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { - assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC // First, load into 32 bits, then truncate to 1 bit. @@ -2566,8 +2617,10 @@ SDValue BasePtr = Load->getBasePtr(); MachineMemOperand *MMO = Load->getMemOperand(); + EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, MVT::i8, MMO); + BasePtr, RealMemVT, MMO); SDValue Ops[] = { DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), @@ -3381,8 +3434,23 @@ } EVT VT = K0->getValueType(0); - return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + + MVT NVT = MVT::i32; + unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + + SDValue Tmp1, Tmp2, Tmp3; + Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + + if (VT == MVT::i16) { + Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, + Tmp1, Tmp2, Tmp3); + + return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); + } else + return DAG.getNode(Signed ? 
AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); } static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -1122,7 +1122,6 @@ include "SIInstructions.td" include "CIInstructions.td" -include "VIInstructions.td" include "DSInstructions.td" include "MIMGInstructions.td" Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -374,7 +374,7 @@ def : Pat < (int_AMDGPU_kilp), - (SI_KILL 0xbf800000) + (SI_KILL (i32 0xbf800000)) >; def : Pat < @@ -555,7 +555,7 @@ def : Pat < (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) >; /********** ================================ **********/ @@ -566,7 +566,7 @@ def : Pat < (fneg (fabs f32:$src)), - (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit + (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit >; // FIXME: Should use S_OR_B32 @@ -575,19 +575,19 @@ (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), // Set sign bit. + (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. 
sub1) >; def : Pat < (fabs f32:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff)) + (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) >; def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) >; def : Pat < @@ -595,8 +595,8 @@ (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. sub1) >; @@ -605,8 +605,8 @@ (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), + (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (i32 (V_MOV_B32_e32 (i32 0x80000000)))), sub1) >; @@ -666,21 +666,21 @@ def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBEMA_F32 0 /* 
src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub3) >; @@ -701,7 +701,7 @@ def : Pat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; @@ -767,32 +767,37 @@ //===----------------------------------------------------------------------===// def : Pat<(i32 (sext_inreg i32:$src, i1)), - (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), - (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 + (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 +>; + +def : Pat < + (i16 (sext_inreg i16:$src, i8)), + (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), - (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 + (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), - (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 + (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i32)), - (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 + (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; def : Pat < (i64 (zext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; def : Pat < @@ 
-804,7 +809,7 @@ (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 0), sub1) + (S_MOV_B32 (i32 0)), sub1) >; @@ -816,25 +821,25 @@ def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) >; def : Pat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 0, -1, $src), sub0, - (V_CNDMASK_B32_e64 0, -1, $src), sub1) + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) >; -class FPToI1Pat : Pat < +class FPToI1Pat : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) >; -def : FPToI1Pat; -def : FPToI1Pat; -def : FPToI1Pat; -def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; +def : FPToI1Pat; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. 
Vector @@ -859,12 +864,12 @@ def : Pat < (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) + (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) >; def : Pat < (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) + (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) >; def : Pat < @@ -888,20 +893,20 @@ def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), 1) + (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; def : Pat < (i1 (trunc i64:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), - (EXTRACT_SUBREG $a, sub0)), 1) + (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; def : Pat < (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 0x00ff00ff), - (V_ALIGNBIT_B32 $a, $a, 24), - (V_ALIGNBIT_B32 $a, $a, 8)) + (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), + (V_ALIGNBIT_B32 $a, $a, (i32 24)), + (V_ALIGNBIT_B32 $a, $a, (i32 8))) >; def : Pat < @@ -917,7 +922,7 @@ def : Pat < (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV 0)) + (BFM $a, (MOV (i32 0))) >; } @@ -928,7 +933,7 @@ def : Pat< (fcanonicalize f32:$src), - (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) + (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0) >; def : Pat< @@ -963,7 +968,7 @@ (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))), DSTCLAMP.NONE, DSTOMOD.NONE) >; Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -123,7 +123,7 @@ // TODO: Do we need to set DwarfRegAlias on register tuples? 
// SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add (sequence "SGPR%u", 0, 103))> { let AllocationPriority = 1; } @@ -190,7 +190,8 @@ (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +// i16 only on VI+ +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -258,8 +259,8 @@ } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SReg_32_XM0, M0)> { +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, + (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> { let AllocationPriority = 1; } @@ -346,7 +347,7 @@ let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)> { +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> { let isAllocatable = 0; } Index: lib/Target/AMDGPU/SOPInstructions.td =================================================================== --- lib/Target/AMDGPU/SOPInstructions.td +++ lib/Target/AMDGPU/SOPInstructions.td @@ -879,7 +879,7 @@ (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, - (S_MOV_B32 0), sub1)) + (S_MOV_B32 (i32 0)), sub1)) >; def : Pat < @@ -887,6 +887,18 @@ (S_ABS_I32 $x) >; +def : Pat < + (i16 imm:$imm), + (S_MOV_B32 imm:$imm) +>; + +// Same as a 32-bit inreg +def : Pat< + (i32 (sext i16:$src)), + (S_SEXT_I32_I16 $src) +>; + + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -898,6 +910,29 @@ (S_ADD_U32 $src0, $src1) >; +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE 
patterns don't support instructions with multiple +// outputs. +def : Pat< + (i64 (zext i16:$src)), + (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + +def : Pat < + (i64 (sext i16:$src)), + (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1) +>; + +def : Pat< + (i32 (zext i16:$src)), + (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) +>; + + + //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/VIInstructions.td =================================================================== --- lib/Target/AMDGPU/VIInstructions.td +++ /dev/null @@ -1,10 +0,0 @@ -//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Instruction definitions for VI and newer. 
-//===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP1Instructions.td +++ lib/Target/AMDGPU/VOP1Instructions.td @@ -301,6 +301,20 @@ } +let Predicates = [isVI] in { + +def : Pat< + (f32 (f16_to_fp i16:$src)), + (V_CVT_F32_F16_e32 $src) +>; + +def : Pat< + (i16 (fp_to_f16 f32:$src)), + (V_CVT_F16_F32_e32 $src) +>; + +} + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -561,10 +575,39 @@ let Predicates = [isVI] in { def : Pat < - (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl), + (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl)), (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; + +def : Pat< + (i32 (anyext i16:$src)), + (COPY $src) +>; + +def : Pat< + (i64 (anyext i16:$src)), + (REG_SEQUENCE VReg_64, + (i32 (COPY $src)), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : Pat< + (i16 (trunc i32:$src)), + (COPY $src) +>; + +def : Pat< + (i1 (trunc i16:$src)), + (COPY $src) +>; + + +def : Pat < + (i16 (trunc i64:$src)), + (EXTRACT_SUBREG $src, sub0) +>; + } // End Predicates = [isVI] Index: lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP2Instructions.td +++ lib/Target/AMDGPU/VOP2Instructions.td @@ -345,6 +345,78 @@ } // End SubtargetPredicate = isVI +// Note: 16-bit instructions produce a 0 result in the high 16-bits. 
+multiclass Arithmetic_i16_Pats { + +def : Pat< + (op i16:$src0, i16:$src1), + (inst $src0, $src1) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i16:$src1))), + (inst $src0, $src1) +>; + +def : Pat< + (i64 (zext (op i16:$src0, i16:$src1))), + (REG_SEQUENCE VReg_64, + (inst $src0, $src1), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; + +} + +multiclass Bits_OpsRev_i16_Pats { + +def : Pat< + (op i16:$src0, i32:$src1), + (inst $src1, $src0) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i32:$src1))), + (inst $src1, $src0) +>; + + +def : Pat< + (i64 (zext (op i16:$src0, i32:$src1))), + (REG_SEQUENCE VReg_64, + (inst $src1, $src0), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; +} + +class ZExt_i16_i1_Pat : Pat < + (i16 (ext i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) +>; + +let Predicates = [isVI] in { + +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; + +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; +defm : Arithmetic_i16_Pats; + +defm : Bits_OpsRev_i16_Pats; +defm : Bits_OpsRev_i16_Pats; +defm : Bits_OpsRev_i16_Pats; + +def : ZExt_i16_i1_Pat; +def : ZExt_i16_i1_Pat; +def : ZExt_i16_i1_Pat; + +} // End Predicates = [isVI] + //===----------------------------------------------------------------------===// // SI //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -222,6 +222,38 @@ } // End SubtargetPredicate = isVI +def : Pat < + (i16 (select i1:$src0, i16:$src1, i16:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +let Predicates = [isVI] in { + +multiclass Tenary_i16_Pats { +def : Pat< + (op2 (op1 i16:$src0, i16:$src1), i16:$src2), + (inst i16:$src0, i16:$src1, 
i16:$src2) +>; + +def : Pat< + (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), + (inst i16:$src0, i16:$src1, i16:$src2) +>; + +def : Pat< + (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), + (REG_SEQUENCE VReg_64, + (inst i16:$src0, i16:$src1, i16:$src2), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; +} + +defm: Tenary_i16_Pats; +defm: Tenary_i16_Pats; + +} // End Predicates = [isVI] + //===----------------------------------------------------------------------===// // Target Index: test/CodeGen/AMDGPU/add.i16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/add.i16.ll @@ -0,0 +1,149 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_i16: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_i16_constant: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %add = add i16 %a, 123 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_i16_neg_constant: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %add = add i16 %a, -845 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_i16_inline_neg1: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] +; VI-NEXT: buffer_store_short [[ADD]] +define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %add = add i16 %a, -1 + store i16 %add, i16 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: buffer_store_dword [[ADD]] +define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + %ext = zext i16 %add to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 +; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} +define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load volatile i16, i16 addrspace(1)* %gep.in0 + %b = load volatile i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + %ext = zext i16 %add to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 +; VI-NEXT: buffer_store_dword [[SEXT]] +define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load i16, i16 addrspace(1)* %gep.in0 + %b = load i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + %ext = sext i16 %add to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64: +; VI: flat_load_ushort [[A:v[0-9]+]] +; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 +; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid + %a = load i16, i16 addrspace(1)* %gep.in0 + %b = load i16, i16 addrspace(1)* %gep.in1 + %add = add i16 %a, %b + %ext = sext i16 %add to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/anyext.ll =================================================================== --- test/CodeGen/AMDGPU/anyext.ll +++ test/CodeGen/AMDGPU/anyext.ll @@ -1,15 +1,40 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; CHECK-LABEL: {{^}}anyext_i1_i32: -; CHECK: v_cndmask_b32_e64 +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone + +; GCN-LABEL: {{^}}anyext_i1_i32: +; GCN: v_cndmask_b32_e64 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { entry: - %0 = icmp eq i32 %cond, 0 - %1 = zext i1 %0 to i8 - 
%2 = xor i8 %1, -1 - %3 = and i8 %2, 1 - %4 = zext i8 %3 to i32 - store i32 %4, i32 addrspace(1)* %out + %tmp = icmp eq i32 %cond, 0 + %tmp1 = zext i1 %tmp to i8 + %tmp2 = xor i8 %tmp1, -1 + %tmp3 = and i8 %tmp2, 1 + %tmp4 = zext i8 %tmp3 to i32 + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_anyext_i16_i32: +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], +; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]] +; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]] +; VI: buffer_store_dword [[AND]] +define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { +entry: + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %a.ptr = getelementptr i16, i16 addrspace(1)* %a, i32 %tid.x + %b.ptr = getelementptr i16, i16 addrspace(1)* %b, i32 %tid.y + %a.l = load i16, i16 addrspace(1)* %a.ptr + %b.l = load i16, i16 addrspace(1)* %b.ptr + %tmp = add i16 %a.l, %b.l + %tmp1 = trunc i16 %tmp to i8 + %tmp2 = xor i8 %tmp1, -1 + %tmp3 = and i8 %tmp2, 1 + %tmp4 = zext i8 %tmp3 to i32 + store i32 %tmp4, i32 addrspace(1)* %out ret void } Index: test/CodeGen/AMDGPU/bitreverse.ll =================================================================== --- test/CodeGen/AMDGPU/bitreverse.ll +++ test/CodeGen/AMDGPU/bitreverse.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s declare i16 @llvm.bitreverse.i16(i16) #1 declare i32 @llvm.bitreverse.i32(i32) #1 @@ -12,7 +13,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 ; FUNC-LABEL: {{^}}s_brev_i16: -; SI: s_brev_b32 +; SI: s_brev_b32 define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { %brev = call i16 @llvm.bitreverse.i16(i16 
%val) #1 store i16 %brev, i16 addrspace(1)* %out Index: test/CodeGen/AMDGPU/cgp-bitfield-extract.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -116,14 +116,19 @@ ; OPT: store ; OPT: ret +; For GFX8: since i16 is a legal type, we cannot sink lshr into BBs. ; GCN-LABEL: {{^}}sink_ubfe_i16: ; GCN-NOT: lshr +; VI: s_bfe_u32 s0, s0, 0xc0004 ; GCN: s_cbranch_vccnz -; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 +; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004 +; VI: s_and_b32 s0, s0, 0xff + ; GCN: BB2_2: -; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 +; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004 +; VI: s_and_b32 s0, s0, 0x7f ; GCN: BB2_3: ; GCN: buffer_store_short Index: test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- test/CodeGen/AMDGPU/copy-illegal-type.ll +++ test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -1,10 +1,13 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; FUNC-LABEL: {{^}}test_copy_v4i8: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm +; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: buffer_store_dword [[REG]] +; GCN: s_endpgm define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 @@ 
-12,10 +15,10 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm +; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: buffer_store_dword [[REG]] +; GCN: buffer_store_dword [[REG]] +; GCN: s_endpgm define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -24,11 +27,11 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm +; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: buffer_store_dword [[REG]] +; GCN: buffer_store_dword [[REG]] +; GCN: buffer_store_dword [[REG]] +; GCN: s_endpgm define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -38,12 +41,12 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm +; GCN: buffer_load_dword [[REG:v[0-9]+]] +; GCN: buffer_store_dword [[REG]] +; GCN: buffer_store_dword [[REG]] +; GCN: buffer_store_dword [[REG]] +; GCN: buffer_store_dword [[REG]] +; GCN: s_endpgm define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -54,14 +57,14 @@ } 
; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; SI: buffer_load_dword -; SI-DAG: v_lshrrev_b32 -; SI: v_and_b32 -; SI: v_or_b32 -; SI-DAG: buffer_store_dword -; SI-DAG: buffer_store_dword +; GCN: buffer_load_dword +; GCN-DAG: v_lshrrev_b32 +; GCN: v_and_b32 +; GCN: v_or_b32 +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_dword -; SI: s_endpgm +; GCN: s_endpgm define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %add = add <4 x i8> %val, @@ -70,18 +73,22 @@ ret void } +; FIXME: Need to handle non-uniform case for function below (load without gep). ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_dword -; SI-DAG: v_lshrrev_b32 +; GCN: {{buffer|flat}}_load_dword +; GCN-DAG: v_lshrrev_b32 ; SI-DAG: v_add_i32 -; SI-DAG: v_and_b32 -; SI-DAG: v_or_b32 -; SI-DAG: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: s_endpgm +; VI-DAG: v_add_u16 +; GCN-DAG: v_and_b32 +; GCN-DAG: v_or_b32 +; GCN-DAG: {{buffer|flat}}_store_dword +; GCN: {{buffer|flat}}_store_dword +; GCN: {{buffer|flat}}_store_dword +; GCN: s_endpgm define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 %add = add <4 x i8> %val, store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 @@ -90,10 +97,10 @@ } ; FUNC-LABEL: {{^}}test_copy_v3i8_align4: -; SI: buffer_load_dword -; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, 
s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; SI: s_endpgm +; GCN: buffer_load_dword +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: s_endpgm define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 @@ -101,11 +108,11 @@ } ; FUNC-LABEL: {{^}}test_copy_v3i8_align2: -; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; SI: s_endpgm +; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: s_endpgm define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 @@ -113,14 +120,14 @@ } ; FUNC-LABEL: {{^}}test_copy_v3i8_align1: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x 
i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 @@ -128,12 +135,12 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_store_dword +; GCN: s_endpgm define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 @@ -141,15 +148,15 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -100,6 +100,7 @@ ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], +; GCN: s_endpgm define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone Index: 
test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone Index: test/CodeGen/AMDGPU/cube.ll =================================================================== --- test/CodeGen/AMDGPU/cube.ll +++ test/CodeGen/AMDGPU/cube.ll @@ -30,10 +30,10 @@ } ; GCN-LABEL: {{^}}legacy_cube: -; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; GCN: buffer_store_dwordx4 define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 { %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx) Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- 
test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1,15 +1,15 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone -; SI-LABEL: {{^}}load_i8_to_f32: -; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] -; SI: buffer_store_dword [[CONV]], +; GCN-LABEL: {{^}}load_i8_to_f32: +; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; GCN-NOT: bfe +; GCN-NOT: lshr +; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] +; GCN: buffer_store_dword [[CONV]], define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %cvt = uitofp i8 %load to float @@ -17,11 +17,11 @@ ret void } -; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LD:v[0-9]+]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; GCN-LABEL: {{^}}load_v2i8_to_v2f32: +; GCN: buffer_load_ushort [[LD:v[0-9]+]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] +; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 %cvt = uitofp <2 x i8> %load to <2 
x float> @@ -29,13 +29,13 @@ ret void } -; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; GCN-LABEL: {{^}}load_v3i8_to_v3f32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-NOT: v_cvt_f32_ubyte3_e32 +; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> @@ -43,15 +43,15 @@ ret void } -; SI-LABEL: {{^}}load_v4i8_to_v4f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] -; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +; GCN-LABEL: {{^}}load_v4i8_to_v4f32: +; GCN: buffer_load_dword [[LOADREG:v[0-9]+]] +; GCN-NOT: bfe +; GCN-NOT: lshr +; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] +; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { 
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <4 x i8> %load to <4 x float> @@ -63,19 +63,19 @@ ; position in the word for the component. ; FIXME: Packing bytes -; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: -; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]] -; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]] -; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]] -; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]] -; SI-DAG: v_lshlrev_b32 -; SI-DAG: v_or_b32 -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] - -; SI: buffer_store_dwordx4 +; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: +; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]] +; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]] +; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]] +; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; GCN-DAG: v_lshlrev_b32 +; GCN-DAG: v_or_b32 +; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], +; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, +; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, +; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] + +; GCN: buffer_store_dwordx4 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <4 x i8> %load to <4 x float> @@ -85,25 +85,31 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; Instructions still emitted to repack bytes for add use. 
-; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: -; SI: {{buffer|flat}}_load_dword -; SI-DAG: v_cvt_f32_ubyte0_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte3_e32 - -; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 -; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 + +; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: +; GCN: {{buffer|flat}}_load_dword +; GCN-DAG: v_cvt_f32_ubyte0_e32 +; GCN-DAG: v_cvt_f32_ubyte1_e32 +; GCN-DAG: v_cvt_f32_ubyte2_e32 +; GCN-DAG: v_cvt_f32_ubyte3_e32 + +; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 +; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 + ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, ; SI-DAG: v_add_i32 -; SI: {{buffer|flat}}_store_dwordx4 -; SI: {{buffer|flat}}_store_dword +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00, +; VI-DAG: v_add_u16_e32 +; VI-DAG: v_add_u16_e32 + +; GCN: {{buffer|flat}}_store_dwordx4 +; GCN: {{buffer|flat}}_store_dword -; SI: s_endpgm +; GCN: s_endpgm define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -116,8 +122,8 @@ } ; Make sure this doesn't crash. 
-; SI-LABEL: {{^}}load_v7i8_to_v7f32: -; SI: s_endpgm +; GCN-LABEL: {{^}}load_v7i8_to_v7f32: +; GCN: s_endpgm define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 %cvt = uitofp <7 x i8> %load to <7 x float> @@ -125,22 +131,22 @@ ret void } -; SI-LABEL: {{^}}load_v8i8_to_v8f32: -; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 +; GCN-LABEL: {{^}}load_v8i8_to_v8f32: +; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; GCN-NOT: bfe +; GCN-NOT: lshr +; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] +; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] +; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] +; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] +; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] +; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] +; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] +; GCN-NOT: bfe +; GCN-NOT: lshr +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> @@ -148,11 +154,11 
@@ ret void } -; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]], -; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] -; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] -; SI: buffer_store_dword [[CONV]], +; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32: +; GCN: buffer_load_dword [[LOADREG:v[0-9]+]], +; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]] +; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] +; GCN: buffer_store_dword [[CONV]], define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 2 @@ -162,7 +168,7 @@ ret void } -; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: +; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %inreg = and i32 %load, 65280 @@ -174,7 +180,7 @@ ; We don't get these ones because of the zext, but instcombine removes ; them so it shouldn't really matter. 
-; SI-LABEL: {{^}}i8_zext_i32_to_f32: +; GCN-LABEL: {{^}}i8_zext_i32_to_f32: define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %ext = zext i8 %load to i32 @@ -183,7 +189,7 @@ ret void } -; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: +; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %ext = zext <4 x i8> %load to <4 x i32> @@ -192,11 +198,11 @@ ret void } -; SI-LABEL: {{^}}extract_byte0_to_f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-NOT: [[VAL]] -; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[CONV]] +; GCN-LABEL: {{^}}extract_byte0_to_f32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-NOT: [[VAL]] +; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[CONV]] define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %and = and i32 %val, 255 @@ -205,11 +211,11 @@ ret void } -; SI-LABEL: {{^}}extract_byte1_to_f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-NOT: [[VAL]] -; SI: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[CONV]] +; GCN-LABEL: {{^}}extract_byte1_to_f32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-NOT: [[VAL]] +; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[CONV]] define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 8 @@ -219,11 +225,11 @@ ret void } -; SI-LABEL: {{^}}extract_byte2_to_f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-NOT: [[VAL]] -; SI: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword 
[[CONV]] +; GCN-LABEL: {{^}}extract_byte2_to_f32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-NOT: [[VAL]] +; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[CONV]] define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 16 @@ -233,11 +239,11 @@ ret void } -; SI-LABEL: {{^}}extract_byte3_to_f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-NOT: [[VAL]] -; SI: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[CONV]] +; GCN-LABEL: {{^}}extract_byte3_to_f32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-NOT: [[VAL]] +; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[CONV]] define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 24 Index: test/CodeGen/AMDGPU/global-extload-i16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-extload-i16.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: cypress is broken because the bigger testcases spill and it's not implemented + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: +; SI: buffer_load_ushort +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: +; SI: buffer_load_sshort +; SI: 
buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: +; SI: buffer_load_ushort +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: +; SI: buffer_load_sshort +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext 
= zext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: +; SI: s_endpgm 
+define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: +; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i16_to_i64: +; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 +; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; 
VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 +define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x 
i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void 
@zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -379,19 +379,33 @@ ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} -; GCN-DAG: v_cvt_f32_f16_e32 -; GCN-DAG: v_cvt_f32_f16_e32 - -; GCN: v_cvt_f64_f32_e32 -; GCN: v_cvt_f64_f32_e32 -; GCN: v_cvt_f64_f32_e32 +; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; XSI: 
v_cvt_f32_f16_e32 +; XSI: v_cvt_f32_f16_e32 +; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} +; XSI: v_cvt_f32_f16_e32 +; XSI-NOT: v_cvt_f32_f16 + +; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; XVI: v_cvt_f32_f16_e32 +; XVI: v_cvt_f32_f16_e32 +; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} +; XVI: v_cvt_f32_f16_e32 +; XVI-NOT: v_cvt_f32_f16 + +; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] +; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] +; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] +; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] + +; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] +; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] +; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] ; GCN-NOT: v_cvt_f64_f32_e32 -; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN: s_endpgm define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in Index: test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI 
-check-prefix=FUNC -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone @@ -73,11 +74,14 @@ } ; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; SI: buffer_load_dword +; GCN: buffer_load_dword ; SI: v_add_i32 ; SI-NEXT: v_and_b32_e32 +; FIXME: Should be using s_add_i32 +; VI: v_add_i32 +; VI-NEXT: v_and_b32_e32 ; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm +; GCN: s_endpgm define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 Index: test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i16.ll +++ test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s ; RUN: llc 
-march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i16: @@ -428,8 +428,15 @@ } ; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64: -; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]], +; FIXME: Need to optimize this sequence to avoid extra bfe: +; t28: i32,ch = load t12, t27, undef:i64 +; t31: i64 = any_extend t28 +; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16 + +; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]], ; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]], +; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]], +; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] Index: test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i16.ll +++ test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -444,8 +444,15 @@ } ; FUNC-LABEL: 
{{^}}global_sextload_i16_to_i64: -; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]], +; FIXME: Need to optimize this sequence to avoid extra bfe: +; t28: i32,ch = load t12, t27, undef:i64 +; t31: i64 = any_extend t28 +; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16 + +; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]], ; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]], +; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]], +; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] Index: test/CodeGen/AMDGPU/load-global-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i8.ll +++ test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -163,7 +163,8 @@ ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; VI-DAG: v_lshrrev_b16_e32 
v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, @@ -185,7 +186,16 @@ ; GCN-NOHSA: buffer_load_dword v ; GCN-HSA: flat_load_dword v -; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +;FIXME: Need to optimize this sequence to avoid extra shift on VI. + +; t23: i16 = truncate t18 +; t49: i16 = srl t23, Constant:i32<8> +; t57: i32 = any_extend t49 +; t58: i32 = sign_extend_inreg t57, ValueType:ch:i8 + +; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} +; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i16: @@ -539,7 +539,13 @@ } ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64: -; GCN: ds_read_i16 v[[LO:[0-9]+]], +; FIXME: Need to optimize this sequence to avoid an extra bfe.
+; t25: i32,ch = load t12, t10, undef:i32 +; t28: i64 = any_extend t25 +; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16 +; SI: ds_read_i16 v[[LO:[0-9]+]], +; VI: ds_read_u16 v[[ULO:[0-9]+]] +; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] Index: test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i8.ll +++ test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -141,8 +141,17 @@ ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 ; GCN: ds_read_u16 -; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 -; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; FIXME: Need to optimize this sequence to avoid extra shift on VI. 
+; t23: i16 = srl t39, Constant:i32<8> +; t31: i32 = any_extend t23 +; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8 + +; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 + +; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} +; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 ; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT @@ -157,7 +166,8 @@ ; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32: ; GCN: ds_read_b32 -; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}} ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, Index: test/CodeGen/AMDGPU/mad_uint24.ll =================================================================== --- test/CodeGen/AMDGPU/mad_uint24.ll +++ test/CodeGen/AMDGPU/mad_uint24.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}u32_mad24: ; EG: MULADD_UINT24 ; SI: v_mad_u32_u24 +; VI: v_mad_u32_u24 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: @@ -25,9 +29,9 @@
; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 16 -; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 - +; FIXME: Should be using scalar instructions here. +; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b @@ -37,14 +41,14 @@ ret void } +; FIXME: Need to handle non-uniform case for function below (load without gep). ; FUNC-LABEL: {{^}}i8_mad24: ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] ; The result must be sign-extended ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 8 -; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - +; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: %0 = mul i8 %a, %b Index: test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/max.i16.ll @@ -0,0 +1,87 @@ +; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s + + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_imax_sge_i16: +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp sge i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_imax_sge_v4i16: +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid + %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep0, align 4 + %b = load <4 x i16>, <4 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <4 x i16> %a, %b + %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %val, <4 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_imax_sgt_i16: +; VI: v_max_i16_e32 +define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp sgt i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_umax_uge_i16: +; VI: v_max_u16_e32 +define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp uge i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_umax_ugt_i16: +; VI: v_max_u16_e32 +define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %a = load i16, i16 addrspace(1)* %gep0, align 4 + %b = load i16, i16 addrspace(1)* %gep1, align 4 + %cmp = icmp ugt i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %outgep, align 4 + ret void +} Index: test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -31,7 +31,8 @@ } ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext: -; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -62,8 +63,9 @@ } ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr: -; GCN: v_mul_u32_u24_e32 -; GCN: v_and_b32_e32 +; SI: v_mul_u32_u24_e32 +; SI: v_and_b32_e32 +; VI: v_mul_lo_u16 define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -77,9 +79,9 @@ ret void } -; FIXME: Need to handle non-uniform case for function below (load without gep). 
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr: -; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { entry: Index: test/CodeGen/AMDGPU/shl.ll =================================================================== --- test/CodeGen/AMDGPU/shl.ll +++ test/CodeGen/AMDGPU/shl.ll @@ -53,6 +53,48 @@ ret void } +;VI: {{^}}shl_i16: +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} + +define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %a = load i16, i16 addrspace(1) * %in + %b = load i16, i16 addrspace(1) * %b_ptr + %result = shl i16 %a, %b + store i16 %result, i16 addrspace(1)* %out + ret void +} + + +;VI: {{^}}shl_v2i16: +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} + +define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 + %a = load <2 x i16>, <2 x i16> addrspace(1) * %in + %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr + %result = shl <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + + +;VI: {{^}}shl_v4i16: +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} +;VI: v_lshlrev_b16_e32 v{{[0-9]+, [0-9]+, [0-9]+}} + +define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 + %a = load <4 x i16>, <4 x i16> addrspace(1) * %in + %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr + %result = shl <4 x 
i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + ;EG-LABEL: {{^}}shl_i64: ;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] ;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} Index: test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- test/CodeGen/AMDGPU/sign_extend.ll +++ test/CodeGen/AMDGPU/sign_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 @@ -55,22 +55,43 @@ } ; GCN-LABEL: {{^}}s_sext_i16_to_i64: -; GCN: s_endpgm +; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { %sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void } +; GCN-LABEL: {{^}}s_sext_i1_to_i16: +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 +; GCN-NEXT: buffer_store_short [[RESULT]] +define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i16 + store i16 %sext, i16 addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32: ; GCN: s_load_dword [[VAL:s[0-9]+]] -; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]] -; GCN-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008 ; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010 ; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24 +; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008 +; GCN-DAG: s_sext_i32_i8 
[[EXT0:s[0-9]+]], [[VAL]] + +; FIXME: We end up with a v_bfe instruction, because the i16 srl +; gets selected to a v_lshrrev_b16 instruction, so the input to +; the bfe is a vector register. To fix this we need to be able to +; optimize: +; t29: i16 = truncate t10 +; t55: i16 = srl t29, Constant:i32<8> +; t63: i32 = any_extend t55 +; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8 + +; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8 ; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]] -; GCN-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]] +; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]] ; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]] ; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]] @@ -96,10 +117,17 @@ ; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8 -; GCN-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8 -; GCN-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8 +; FIXME: need to optimize the same sequence as in the test above to avoid +; this shift. +; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]] ; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]] +; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8 +; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8 +; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8 + +; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8 +; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8 +; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8 ; GCN: buffer_store_dword [[EXT0]] ; GCN: buffer_store_dword [[EXT1]] Index: test/CodeGen/AMDGPU/sra.ll =================================================================== --- test/CodeGen/AMDGPU/sra.ll +++ test/CodeGen/AMDGPU/sra.ll @@ -46,6 +46,36 @@ ret void } +; FUNC-LABEL: {{^}}ashr_v2i16: +; FIXME: The ashr operation is uniform, but because its operands come from a +; global load we end up with the vector instructions rather than scalar.
+; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = ashr <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ashr_v4i16: +; FIXME: The ashr operation is uniform, but because its operands come from a +; global load we end up with the vector instructions rather than scalar. +; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = ashr <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_ashr_i64: ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -54,6 +54,46 @@ ret void } +; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %a = load i16, i16 addrspace(1)* %in + %b = load i16, i16 addrspace(1)* %b_ptr + %result = sub i16 %a, %b + store i16 %result, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_sub_v2i16: + +; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: 
v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 + %a = load <2 x i16>, <2 x i16> addrspace(1) * %in + %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr + %result = sub <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_sub_v4i16: + +; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 + %a = load <4 x i16>, <4 x i16> addrspace(1) * %in + %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr + %result = sub <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}s_sub_i64: ; SI: s_sub_u32 ; SI: s_subb_u32 Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,VI %s ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v @@ -47,7 +47,12 @@ } ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; FIXME: We need to teach the dagcombiner to reduce load width for: +; t21: v2i32,ch = load t12, t10, undef:i64 +; t23: i64 = bitcast t21 +; t30: i16 =
truncate t23 +; SI: buffer_load_dword v[[VAL:[0-9]+]] +; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]] -; CHECK: buffer_store_short [[VAL]] +; CHECK: buffer_store_short v[[VAL]] define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in Index: test/CodeGen/AMDGPU/trunc-store-i1.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store-i1.ll +++ test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -21,13 +21,20 @@ ret void } -; SI-LABEL: {{^}}global_truncstore_i16_to_i1: +; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1: ; SI: s_load_dword [[LOAD:s[0-9]+]], ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { +define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { %trunc = trunc i16 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } +; SI-LABEL: {{^}}global_truncstore_i16_to_i1: +define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { + %add = add i16 %val0, %val1 + %trunc = trunc i16 %add to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} Index: test/CodeGen/AMDGPU/zero_extend.ll =================================================================== --- test/CodeGen/AMDGPU/zero_extend.ll +++ test/CodeGen/AMDGPU/zero_extend.ll @@ -2,39 +2,58 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; R600: {{^}}test: +; R600: {{^}}s_mad_zext_i32_to_i64: ; R600: MEM_RAT_CACHELESS STORE_RAW ; R600: MEM_RAT_CACHELESS STORE_RAW -; SI: {{^}}test: +; SI: {{^}}s_mad_zext_i32_to_i64: ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @test(i64
addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { entry: - %0 = mul i32 %a, %b - %1 = add i32 %0, %c - %2 = zext i32 %1 to i64 - store i64 %2, i64 addrspace(1)* %out + %tmp0 = mul i32 %a, %b + %tmp1 = add i32 %tmp0, %c + %tmp2 = zext i32 %tmp1 to i64 + store i64 %tmp2, i64 addrspace(1)* %out ret void } -; SI-LABEL: {{^}}testi1toi32: +; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32: ; SI: v_cndmask_b32 -define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: - %0 = icmp eq i32 %a, %b - %1 = zext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out + %tmp0 = icmp eq i32 %a, %b + %tmp1 = zext i1 %tmp0 to i32 + store i32 %tmp1, i32 addrspace(1)* %out ret void } -; SI-LABEL: {{^}}zext_i1_to_i64: +; SI-LABEL: {{^}}s_arg_zext_i1_to_i64: +define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { + %ext = zext i1 %arg to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64: ; SI: s_mov_b32 s{{[0-9]+}}, 0 ; SI: v_cmp_eq_u32 ; SI: v_cndmask_b32 -define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void } + +; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; SI: buffer_store_short [[RESULT]] +define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { + %tmp0 = icmp eq i16 %a, %b + %tmp1 = zext i1 %tmp0 to i16 + store i16 %tmp1, i16 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind }