diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10863,7 +10863,7 @@ // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) { - Shift = DAG.getZExtOrTrunc(Shift.getOperand(0), + SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32); unsigned ShiftOffset = 8 * Offset; @@ -10874,7 +10874,7 @@ if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL, - MVT::f32, Shift); + MVT::f32, Shifted); } } } diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -823,18 +823,16 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: 
s_endpgm @@ -865,7 +863,7 @@ ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -888,7 +886,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] @@ -1038,31 +1036,29 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5 ; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: 
s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v6 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1078,39 +1074,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: flat_load_ubyte v11, v[6:7] -; VI-NEXT: flat_load_ubyte v8, v[8:9] +; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 2, v0 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v8, 
v[8:9] +; VI-NEXT: flat_load_ubyte v9, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] -; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v7, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v12 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v9 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11 -; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1119,32 +1115,31 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:6 +; 
GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_short_d16 v4, v0, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX10-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid @@ -1417,18 +1412,16 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; 
SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1444,24 +1437,24 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v6, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -1470,23 +1463,22 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 8, v2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte_vector.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte_vector.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte_vector.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s + +%Vec = type { [4 x i8] } + +define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { +; CHECK-LABEL: cvt_f32_ubyte0_vector: +; CHECK: ; %bb.0: ; %entry +; CHECK: flat_load_ubyte [[REG0:v[0-9]+]], v[1:2] +; CHECK: flat_load_ubyte [[REG1:v[0-9]+]], v[1:2] 
offset:1 +; CHECK: flat_load_ubyte [[REG2:v[0-9]+]], v[1:2] offset:2 +; CHECK: flat_load_ubyte [[REG3:v[0-9]+]], v[1:2] offset:3 +; CHECK: v_cvt_f32_ubyte0_e32 [[CVT:v[0-9]+]], [[REG3]] +; CHECK: v_fma_f32 v2, v7, [[CVT]], 0.5 +entry: + br label %for.body.i + +for.body.i: ; preds = %for.body.i, %entry + %retval.sroa.0.0.copyload.i.i.i.i.i.i = load %Vec*, %Vec* addrspace(1)* undef, align 8 + %add.ptr.i.i.i.i.i.i.i = getelementptr inbounds %Vec, %Vec* %retval.sroa.0.0.copyload.i.i.i.i.i.i, i64 undef + %retval.sroa.0.0..sroa_cast.i.i.i.i.i.i.i.i = bitcast %Vec* %add.ptr.i.i.i.i.i.i.i to i32* + %retval.sroa.0.0.copyload.i.i.i.i.i.i.i.i = load i32, i32* %retval.sroa.0.0..sroa_cast.i.i.i.i.i.i.i.i, align 1 + %p1.sroa.6.0.extract.shift.i.i.i.i = lshr i32 %retval.sroa.0.0.copyload.i.i.i.i.i.i.i.i, 24 + %p1.sroa.6.0.extract.trunc.i.i.i.i = trunc i32 %p1.sroa.6.0.extract.shift.i.i.i.i to i8 + %conv12.i.i.i.i.i = uitofp i8 %p1.sroa.6.0.extract.trunc.i.i.i.i to float + %0 = load float, float addrspace(1)* undef, align 8 + %mul.i.i.i.i.i = fmul contract float %0, %conv12.i.i.i.i.i + %add.i.i.i.i.i = fadd contract float %mul.i.i.i.i.i, 5.000000e-01 + %conv13.i.i.i.i.i = fptoui float %add.i.i.i.i.i to i8 + %retval.sroa.4.0.insert.ext.i.i.i.i.i = zext i8 %conv13.i.i.i.i.i to i32 + %retval.sroa.4.0.insert.shift.i.i.i.i.i = shl nuw i32 %retval.sroa.4.0.insert.ext.i.i.i.i.i, 24 + %retval.sroa.3.0.insert.ext.i.i.i.i.i = and i32 %retval.sroa.0.0.copyload.i.i.i.i.i.i.i.i, 16711680 + %retval.sroa.3.0.insert.insert.i.i.i.i.i = or i32 %retval.sroa.4.0.insert.shift.i.i.i.i.i, %retval.sroa.3.0.insert.ext.i.i.i.i.i + %retval.sroa.2.0.insert.ext.i.i.i.i.i = and i32 %retval.sroa.0.0.copyload.i.i.i.i.i.i.i.i, undef + %retval.sroa.2.0.insert.insert.i.i.i.i.i = or i32 %retval.sroa.3.0.insert.insert.i.i.i.i.i, %retval.sroa.2.0.insert.ext.i.i.i.i.i + %retval.sroa.0.0.insert.ext.i.i.i.i.i = and i32 %retval.sroa.0.0.copyload.i.i.i.i.i.i.i.i, 255 + %retval.sroa.0.0.insert.insert.i.i.i.i.i = or i32 
%retval.sroa.2.0.insert.insert.i.i.i.i.i, %retval.sroa.0.0.insert.ext.i.i.i.i.i + store i32 %retval.sroa.0.0.insert.insert.i.i.i.i.i, i32* undef, align 1 + br label %for.body.i +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.amdgcn.workitem.id.y() #0 + +attributes #0 = { nounwind readnone speculatable willreturn }