# Changeset View

Changeset View

# Standalone View

Standalone View

# llvm/trunk/test/CodeGen/AMDGPU/idot2.ll

1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||||
---|---|---|---|---|---|

2 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s | ||||

3 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89 %s | ||||

4 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89 %s | ||||

5 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-DL %s | ||||

6 | | ||||

7 | ; add(mul(S0.x, S1.x), | ||||

8 | ; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) | ||||

9 | | ||||

; udot2: zero-extends lanes 0 and 1 of the <2 x i16> values loaded from %src1
; and %src2, multiplies them lane-wise, and adds both products to the i32
; loaded from %dst before storing the sum back to %dst.
; The GCN-DL checks expect a single v_dot2_u32_u16; the GFX7 and GFX89 checks
; expect two chained v_mad_u32_u24 instructions instead.

10 | define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, | ||||

11 | ; GFX7-LABEL: udot2: | ||||

12 | ; GFX7: ; %bb.0: ; %entry | ||||

13 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

14 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

15 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

16 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

17 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

18 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

19 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

20 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

21 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

22 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

23 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

24 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

25 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

26 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

27 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

28 | ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 | ||||

29 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

30 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

31 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

32 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

33 | ; GFX7-NEXT: s_endpgm | ||||

34 | ; | ||||

35 | ; GFX89-LABEL: udot2: | ||||

36 | ; GFX89: ; %bb.0: ; %entry | ||||

37 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

38 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

39 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

40 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

41 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

42 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

43 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

44 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

45 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

46 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

47 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

48 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

49 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

50 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

51 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

52 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

53 | ; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

54 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

55 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

56 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

57 | ; GFX89-NEXT: s_endpgm | ||||

58 | ; | ||||

59 | ; GCN-DL-LABEL: udot2: | ||||

60 | ; GCN-DL: ; %bb.0: ; %entry | ||||

61 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

62 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

63 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

64 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

65 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

66 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

67 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

68 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

69 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

70 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s2 | ||||

71 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 | ||||

72 | ; GCN-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 | ||||

73 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

74 | ; GCN-DL-NEXT: s_endpgm | ||||

75 | <2 x i16> addrspace(1)* %src2, | ||||

76 | i32 addrspace(1)* nocapture %dst) { | ||||

77 | entry: | ||||

78 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

79 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

80 | | ||||

81 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

82 | %conv = zext i16 %s1.elt1 to i32 | ||||

83 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

84 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

85 | %mul1 = mul nuw i32 %conv2, %conv | ||||

86 | | ||||

87 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

88 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

89 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

90 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

91 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

92 | | ||||

93 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

94 | %add = add i32 %mul2, %s3 | ||||

95 | %add6 = add i32 %add, %mul1 | ||||

96 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

97 | ret void | ||||

98 | } | ||||

99 | | ||||

100 | ; TODO: Support this pattern | ||||

101 | ; add(S3, | ||||

102 | ; add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) | ||||

; udot2_MulMul: same zero-extended lane-wise products as @udot2, but the two
; products are added to each other first (%add) and the value loaded from %dst
; is added last (%add6).
; None of the three check prefixes expects a v_dot2 here: each expects
; v_mul_u32_u24 followed by v_mad_u32_u24 and a separate vector add.
; NOTE(review): the GFX89 v_add_u32_e32 check names only the destination
; register, presumably so one line matches both gfx803 and gfx900 output --
; confirm before regenerating.

103 | define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, | ||||

104 | ; GFX7-LABEL: udot2_MulMul: | ||||

105 | ; GFX7: ; %bb.0: ; %entry | ||||

106 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

107 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

108 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

109 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

110 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

111 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

112 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

113 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

114 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

115 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

116 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

117 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

118 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

119 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

120 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

121 | ; GFX7-NEXT: v_mul_u32_u24_e32 v0, s5, v0 | ||||

122 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

123 | ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 | ||||

124 | ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s6, v0 | ||||

125 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

126 | ; GFX7-NEXT: s_endpgm | ||||

127 | ; | ||||

128 | ; GFX89-LABEL: udot2_MulMul: | ||||

129 | ; GFX89: ; %bb.0: ; %entry | ||||

130 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

131 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

132 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

133 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

134 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

135 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

136 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

137 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

138 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

139 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

140 | ; GFX89-NEXT: v_mov_b32_e32 v0, s6 | ||||

141 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

142 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

143 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

144 | ; GFX89-NEXT: v_mul_u32_u24_e32 v0, s2, v0 | ||||

145 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

146 | ; GFX89-NEXT: v_add_u32_e32 v2 | ||||

147 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

148 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

149 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

150 | ; GFX89-NEXT: s_endpgm | ||||

151 | ; | ||||

152 | ; GCN-DL-LABEL: udot2_MulMul: | ||||

153 | ; GCN-DL: ; %bb.0: ; %entry | ||||

154 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

155 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

156 | ; GCN-DL-NEXT: s_mov_b32 s2, 0xffff | ||||

157 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

158 | ; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

159 | ; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

160 | ; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

161 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

162 | ; GCN-DL-NEXT: s_and_b32 s6, s3, s2 | ||||

163 | ; GCN-DL-NEXT: s_and_b32 s2, s4, s2 | ||||

164 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s6 | ||||

165 | ; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16 | ||||

166 | ; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16 | ||||

167 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s3 | ||||

168 | ; GCN-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 | ||||

169 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

170 | ; GCN-DL-NEXT: v_add_u32_e32 v2, s5, v0 | ||||

171 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

172 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

173 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

174 | ; GCN-DL-NEXT: s_endpgm | ||||

175 | <2 x i16> addrspace(1)* %src2, | ||||

176 | i32 addrspace(1)* nocapture %dst) { | ||||

177 | entry: | ||||

178 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

179 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

180 | | ||||

181 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

182 | %conv = zext i16 %s1.elt1 to i32 | ||||

183 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

184 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

185 | %mul1 = mul nuw i32 %conv2, %conv | ||||

186 | | ||||

187 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

188 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

189 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

190 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

191 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

192 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

193 | %add = add i32 %mul2, %mul1 | ||||

194 | %add6 = add i32 %add, %s3 | ||||

195 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

196 | ret void | ||||

197 | } | ||||

198 | | ||||

; idot2: signed counterpart of @udot2. Lanes 0 and 1 of %src1 and %src2 are
; sign-extended, multiplied lane-wise, and both products are added to the i32
; loaded from %dst.
; The GCN-DL checks expect a single v_dot2_i32_i16; the GFX7 and GFX89 checks
; expect two chained v_mad_i32_i24 instructions.
; NOTE(review): both multiplies carry `nuw` although their operands are
; sign-extended -- confirm that this flag is intended for the signed case.

199 | define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, | ||||

200 | ; GFX7-LABEL: idot2: | ||||

201 | ; GFX7: ; %bb.0: ; %entry | ||||

202 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

203 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

204 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

205 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

206 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

207 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

208 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

209 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

210 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

211 | ; GFX7-NEXT: s_sext_i32_i16 s7, s4 | ||||

212 | ; GFX7-NEXT: s_ashr_i32 s4, s4, 16 | ||||

213 | ; GFX7-NEXT: s_sext_i32_i16 s8, s5 | ||||

214 | ; GFX7-NEXT: s_ashr_i32 s5, s5, 16 | ||||

215 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

216 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

217 | ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1 | ||||

218 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

219 | ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 | ||||

220 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

221 | ; GFX7-NEXT: s_endpgm | ||||

222 | ; | ||||

223 | ; GFX89-LABEL: idot2: | ||||

224 | ; GFX89: ; %bb.0: ; %entry | ||||

225 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

226 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

227 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

228 | ; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

229 | ; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

230 | ; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

231 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

232 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

233 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

234 | ; GFX89-NEXT: s_sext_i32_i16 s0, s2 | ||||

235 | ; GFX89-NEXT: s_ashr_i32 s2, s2, 16 | ||||

236 | ; GFX89-NEXT: s_sext_i32_i16 s1, s3 | ||||

237 | ; GFX89-NEXT: s_ashr_i32 s3, s3, 16 | ||||

238 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

239 | ; GFX89-NEXT: v_mov_b32_e32 v3, s2 | ||||

240 | ; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

241 | ; GFX89-NEXT: v_mov_b32_e32 v3, s0 | ||||

242 | ; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

243 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

244 | ; GFX89-NEXT: s_endpgm | ||||

245 | ; | ||||

246 | ; GCN-DL-LABEL: idot2: | ||||

247 | ; GCN-DL: ; %bb.0: ; %entry | ||||

248 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

249 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

250 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

251 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

252 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

253 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

254 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

255 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

256 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

257 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s2 | ||||

258 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 | ||||

259 | ; GCN-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3 | ||||

260 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

261 | ; GCN-DL-NEXT: s_endpgm | ||||

262 | <2 x i16> addrspace(1)* %src2, | ||||

263 | i32 addrspace(1)* nocapture %dst) { | ||||

264 | entry: | ||||

265 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

266 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

267 | | ||||

268 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

269 | %conv = sext i16 %s1.elt1 to i32 | ||||

270 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

271 | %conv2 = sext i16 %s2.elt1 to i32 | ||||

272 | %mul1 = mul nuw i32 %conv2, %conv | ||||

273 | | ||||

274 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

275 | %conv3 = sext i16 %s1.elt2 to i32 | ||||

276 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

277 | %conv4 = sext i16 %s2.elt2 to i32 | ||||

278 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

279 | | ||||

280 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

281 | %add = add i32 %mul2, %s3 | ||||

282 | %add6 = add i32 %add, %mul1 | ||||

283 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

284 | ret void | ||||

285 | } | ||||

286 | | ||||

; idot2_MixedTypedMul: the lane-0 product uses sign-extended operands while the
; lane-1 product uses zero-extended operands; both are added to the i32 loaded
; from %dst.
; All three check prefixes, including GCN-DL, expect one v_mad_u32_u24 followed
; by one v_mad_i32_i24, i.e. no v_dot2 instruction for this mixed case.

287 | define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, | ||||

288 | ; GFX7-LABEL: idot2_MixedTypedMul: | ||||

289 | ; GFX7: ; %bb.0: ; %entry | ||||

290 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

291 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

292 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

293 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

294 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

295 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

296 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

297 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

298 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

299 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

300 | ; GFX7-NEXT: s_lshr_b32 s8, s5, 16 | ||||

301 | ; GFX7-NEXT: s_sext_i32_i16 s4, s4 | ||||

302 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

303 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

304 | ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v0, v1 | ||||

305 | ; GFX7-NEXT: s_sext_i32_i16 s5, s5 | ||||

306 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

307 | ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0 | ||||

308 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

309 | ; GFX7-NEXT: s_endpgm | ||||

310 | ; | ||||

311 | ; GFX89-LABEL: idot2_MixedTypedMul: | ||||

312 | ; GFX89: ; %bb.0: ; %entry | ||||

313 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

314 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

315 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

316 | ; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

317 | ; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

318 | ; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

319 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

320 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

321 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

322 | ; GFX89-NEXT: s_sext_i32_i16 s0, s2 | ||||

323 | ; GFX89-NEXT: s_lshr_b32 s2, s2, 16 | ||||

324 | ; GFX89-NEXT: s_sext_i32_i16 s1, s3 | ||||

325 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

326 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

327 | ; GFX89-NEXT: v_mov_b32_e32 v3, s2 | ||||

328 | ; GFX89-NEXT: v_mad_u32_u24 v2, s3, v3, v2 | ||||

329 | ; GFX89-NEXT: v_mov_b32_e32 v3, s0 | ||||

330 | ; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

331 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

332 | ; GFX89-NEXT: s_endpgm | ||||

333 | ; | ||||

334 | ; GCN-DL-LABEL: idot2_MixedTypedMul: | ||||

335 | ; GCN-DL: ; %bb.0: ; %entry | ||||

336 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

337 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

338 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

339 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

340 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

341 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

342 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

343 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

344 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

345 | ; GCN-DL-NEXT: s_sext_i32_i16 s0, s2 | ||||

346 | ; GCN-DL-NEXT: s_lshr_b32 s2, s2, 16 | ||||

347 | ; GCN-DL-NEXT: s_sext_i32_i16 s1, s3 | ||||

348 | ; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16 | ||||

349 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 | ||||

350 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s2 | ||||

351 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 | ||||

352 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s0 | ||||

353 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

354 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

355 | ; GCN-DL-NEXT: s_endpgm | ||||

356 | <2 x i16> addrspace(1)* %src2, | ||||

357 | i32 addrspace(1)* nocapture %dst) { | ||||

358 | entry: | ||||

359 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

360 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

361 | | ||||

362 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

363 | %conv = sext i16 %s1.elt1 to i32 | ||||

364 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

365 | %conv2 = sext i16 %s2.elt1 to i32 | ||||

366 | %mul1 = mul nuw i32 %conv2, %conv | ||||

367 | | ||||

368 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

369 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

370 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

371 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

372 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

373 | | ||||

374 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

375 | %add = add i32 %mul2, %s3 | ||||

376 | %add6 = add i32 %add, %mul1 | ||||

377 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

378 | ret void | ||||

379 | } | ||||

380 | | ||||

; udot2_alt_AddOperands: same zero-extended two-lane dot product as @udot2, but
; with the operands of both adds swapped (%s3 + %mul2, then %mul1 + %add).
; The GCN-DL checks expect a single v_dot2_u32_u16; the GFX7 and GFX89 checks
; expect two chained v_mad_u32_u24 instructions.
; The assertions below were previously placed inside the signature of
; @idot2_MixedExt; they are kept in the same relative order in the file, so the
; sequence of FileCheck directives is unchanged.

381 | define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, | ||||

382 | ; GFX7-LABEL: udot2_alt_AddOperands: | ||||

383 | ; GFX7: ; %bb.0: ; %entry | ||||

384 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

385 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

386 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

387 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

388 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

389 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

390 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

391 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

392 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

393 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

394 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

395 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

396 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

397 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

398 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

399 | ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 | ||||

400 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

401 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

402 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

403 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

404 | ; GFX7-NEXT: s_endpgm | ||||

405 | ; | ||||

406 | ; GFX89-LABEL: udot2_alt_AddOperands: | ||||

407 | ; GFX89: ; %bb.0: ; %entry | ||||

408 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

409 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

410 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

411 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

412 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

413 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

414 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

415 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

416 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

417 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

418 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

419 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

420 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

421 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

422 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

423 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

424 | ; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

425 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

426 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

427 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

428 | ; GFX89-NEXT: s_endpgm | ||||

429 | ; | ||||

430 | ; GCN-DL-LABEL: udot2_alt_AddOperands: | ||||

431 | ; GCN-DL: ; %bb.0: ; %entry | ||||

432 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

433 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

434 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

435 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

436 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

437 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

438 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

439 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

440 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

441 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s2 | ||||

442 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 | ||||

443 | ; GCN-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 | ||||

444 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

445 | ; GCN-DL-NEXT: s_endpgm | ||||

446 | <2 x i16> addrspace(1)* %src2, | ||||

447 | i32 addrspace(1)* nocapture %dst) { | ||||

448 | entry: | ||||

449 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

450 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

451 | | ||||

452 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

453 | %conv = zext i16 %s1.elt1 to i32 | ||||

454 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

455 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

456 | %mul1 = mul nuw i32 %conv2, %conv | ||||

457 | | ||||

458 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

459 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

460 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

461 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

462 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

463 | | ||||

464 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

465 | %add = add i32 %s3, %mul2 | ||||

466 | %add6 = add i32 %mul1, %add | ||||

467 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

468 | ret void | ||||

469 | } | ||||

470 | | ||||

; idot2_MixedExt: in the lane-0 product %conv is sign-extended while %conv2 is
; zero-extended; the lane-1 product sign-extends both operands. Both products
; are added to the i32 loaded from %dst.
; NOTE(review): this function has no GFX7/GFX89/GCN-DL assertions of its own.
; Regenerate them with utils/update_llc_test_checks.py so its codegen is
; actually checked.

471 | define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, | ||||

472 | <2 x i16> addrspace(1)* %src2, | ||||

473 | i32 addrspace(1)* nocapture %dst) { | ||||

474 | entry: | ||||

475 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

476 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

477 | | ||||

478 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

479 | %conv = sext i16 %s1.elt1 to i32 | ||||

480 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

481 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

482 | %mul1 = mul nuw i32 %conv2, %conv | ||||

483 | | ||||

484 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

485 | %conv3 = sext i16 %s1.elt2 to i32 | ||||

486 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

487 | %conv4 = sext i16 %s2.elt2 to i32 | ||||

488 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

489 | | ||||

490 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

491 | %add = add i32 %mul2, %s3 | ||||

492 | %add6 = add i32 %add, %mul1 | ||||

493 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

494 | ret void | ||||

495 | } | ||||

496 | | ||||

; notudot2_SameVec: each product multiplies a lane by itself rather than pairing
; lanes from the two sources: %mul1 squares lane 0 of %vec1 and %mul2 squares
; lane 1 of %vec2 (all zero-extended). Both are added to the i32 loaded from
; %dst.
; All three check prefixes, including GCN-DL, expect two v_mad_u32_u24 with
; identical scalar multiplicands and no v_dot2 instruction.

497 | define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, | ||||

498 | ; GFX7-LABEL: notudot2_SameVec: | ||||

499 | ; GFX7: ; %bb.0: ; %entry | ||||

500 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

501 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

502 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

503 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

504 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

505 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

506 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

507 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

508 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

509 | ; GFX7-NEXT: s_and_b32 s4, s4, 0xffff | ||||

510 | ; GFX7-NEXT: s_lshr_b32 s5, s5, 16 | ||||

511 | ; GFX7-NEXT: v_mov_b32_e32 v0, s6 | ||||

512 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, s5, v0 | ||||

513 | ; GFX7-NEXT: v_mad_u32_u24 v0, s4, s4, v0 | ||||

514 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

515 | ; GFX7-NEXT: s_endpgm | ||||

516 | ; | ||||

517 | ; GFX89-LABEL: notudot2_SameVec: | ||||

518 | ; GFX89: ; %bb.0: ; %entry | ||||

519 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

520 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

521 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

522 | ; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

523 | ; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

524 | ; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

525 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

526 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

527 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

528 | ; GFX89-NEXT: s_and_b32 s0, s2, 0xffff | ||||

529 | ; GFX89-NEXT: s_lshr_b32 s1, s3, 16 | ||||

530 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

531 | ; GFX89-NEXT: v_mad_u32_u24 v2, s1, s1, v2 | ||||

532 | ; GFX89-NEXT: v_mad_u32_u24 v2, s0, s0, v2 | ||||

533 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

534 | ; GFX89-NEXT: s_endpgm | ||||

535 | ; | ||||

536 | ; GCN-DL-LABEL: notudot2_SameVec: | ||||

537 | ; GCN-DL: ; %bb.0: ; %entry | ||||

538 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

539 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

540 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

541 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

542 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

543 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

544 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

545 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

546 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

547 | ; GCN-DL-NEXT: s_and_b32 s0, s2, 0xffff | ||||

548 | ; GCN-DL-NEXT: s_lshr_b32 s1, s3, 16 | ||||

549 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 | ||||

550 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 | ||||

551 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 | ||||

552 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

553 | ; GCN-DL-NEXT: s_endpgm | ||||

554 | <2 x i16> addrspace(1)* %src2, | ||||

555 | i32 addrspace(1)* nocapture %dst) { | ||||

556 | entry: | ||||

557 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

558 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

559 | | ||||

560 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

561 | %conv = zext i16 %s1.elt1 to i32 | ||||

562 | %s2.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

563 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

564 | %mul1 = mul i32 %conv2, %conv | ||||

565 | | ||||

566 | %s1.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

567 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

568 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

569 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

570 | %mul2 = mul i32 %conv4, %conv3 | ||||

571 | | ||||

572 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

573 | %add = add i32 %mul2, %s3 | ||||

574 | %add6 = add i32 %add, %mul1 | ||||

575 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

576 | ret void | ||||

577 | } | ||||

578 | | ||||

; udot2_v4i16: loads <4 x i16> vectors from %src1 and %src2 but only uses lanes
; 0 and 1, zero-extended and multiplied lane-wise, then added to the i32 loaded
; from %dst.
; The GCN-DL checks expect a single v_dot2_u32_u16 fed by one s_load_dword per
; source at offset 0x0; the GFX7 and GFX89 checks expect two chained
; v_mad_u32_u24 instructions.

579 | define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, | ||||

580 | ; GFX7-LABEL: udot2_v4i16: | ||||

581 | ; GFX7: ; %bb.0: ; %entry | ||||

582 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

583 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

584 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

585 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

586 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

587 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

588 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

589 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

590 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

591 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

592 | ; GFX7-NEXT: s_and_b32 s7, s4, s8 | ||||

593 | ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 | ||||

594 | ; GFX7-NEXT: s_and_b32 s8, s5, s8 | ||||

595 | ; GFX7-NEXT: s_lshr_b32 s5, s5, 16 | ||||

596 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

597 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

598 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 | ||||

599 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

600 | ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 | ||||

601 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

602 | ; GFX7-NEXT: s_endpgm | ||||

603 | ; | ||||

604 | ; GFX89-LABEL: udot2_v4i16: | ||||

605 | ; GFX89: ; %bb.0: ; %entry | ||||

606 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

607 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

608 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

609 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

610 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

611 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

612 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

613 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

614 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

615 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

616 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

617 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

618 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

619 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

620 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

621 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

622 | ; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

623 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

624 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

625 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

626 | ; GFX89-NEXT: s_endpgm | ||||

627 | ; | ||||

628 | ; GCN-DL-LABEL: udot2_v4i16: | ||||

629 | ; GCN-DL: ; %bb.0: ; %entry | ||||

630 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

631 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

632 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

633 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

634 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

635 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

636 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

637 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

638 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

639 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s2 | ||||

640 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 | ||||

641 | ; GCN-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 | ||||

642 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

643 | ; GCN-DL-NEXT: s_endpgm | ||||

644 | <4 x i16> addrspace(1)* %src2, | ||||

645 | i32 addrspace(1)* nocapture %dst) { | ||||

646 | entry: | ||||

647 | %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 | ||||

648 | %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 | ||||

649 | | ||||

650 | %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 | ||||

651 | %conv = zext i16 %s1.elt1 to i32 | ||||

652 | %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 | ||||

653 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

654 | %mul1 = mul i32 %conv2, %conv | ||||

655 | | ||||

656 | %s1.elt2 = extractelement <4 x i16> %vec1, i64 1 | ||||

657 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

658 | %s2.elt2 = extractelement <4 x i16> %vec2, i64 1 | ||||

659 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

660 | %mul2 = mul i32 %conv4, %conv3 | ||||

661 | | ||||

662 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

663 | %add = add i32 %mul2, %s3 | ||||

664 | %add6 = add i32 %add, %mul1 | ||||

665 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

666 | ret void | ||||

667 | } | ||||

668 | | ||||

; Test udot2_v4i16_Hi: zero-extends lanes 2 and 3 of two <4 x i16> loads,
; multiplies them lane by lane, and adds both products to the i32 read from
; %dst before storing the sum back to %dst.
; Per the check lines below, the GFX7 and GFX89 prefixes expect two chained
; v_mad_u32_u24, while the GCN-DL prefix expects one v_dot2_u32_u16 whose
; vector inputs come from the s_load_dword at offset 0x4 of each source.
; The check lines between the first and second parameter are FileCheck
; assertions (the file header says they are autogenerated); treat them as test
; input, not as commentary.
669 | define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, | ||||

670 | ; GFX7-LABEL: udot2_v4i16_Hi: | ||||

671 | ; GFX7: ; %bb.0: ; %entry | ||||

672 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

673 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

674 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

675 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

676 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

677 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

678 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x1 | ||||

679 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 | ||||

680 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

681 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

682 | ; GFX7-NEXT: s_and_b32 s7, s4, s8 | ||||

683 | ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 | ||||

684 | ; GFX7-NEXT: s_and_b32 s8, s5, s8 | ||||

685 | ; GFX7-NEXT: s_lshr_b32 s5, s5, 16 | ||||

686 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

687 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

688 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 | ||||

689 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

690 | ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 | ||||

691 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

692 | ; GFX7-NEXT: s_endpgm | ||||

693 | ; | ||||

694 | ; GFX89-LABEL: udot2_v4i16_Hi: | ||||

695 | ; GFX89: ; %bb.0: ; %entry | ||||

696 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

697 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

698 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

699 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

700 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x4 | ||||

701 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x4 | ||||

702 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

703 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

704 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

705 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

706 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

707 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

708 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

709 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

710 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

711 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

712 | ; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

713 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

714 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

715 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

716 | ; GFX89-NEXT: s_endpgm | ||||

717 | ; | ||||

718 | ; GCN-DL-LABEL: udot2_v4i16_Hi: | ||||

719 | ; GCN-DL: ; %bb.0: ; %entry | ||||

720 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

721 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

722 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

723 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x4 | ||||

724 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x4 | ||||

725 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

726 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

727 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

728 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

729 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s2 | ||||

730 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 | ||||

731 | ; GCN-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 | ||||

732 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

733 | ; GCN-DL-NEXT: s_endpgm | ||||

734 | <4 x i16> addrspace(1)* %src2, | ||||

735 | i32 addrspace(1)* nocapture %dst) { | ||||

736 | entry: | ||||

737 | %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 | ||||

738 | %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 | ||||

739 | | ||||

; Lanes 2 and 3 of each vector are the two dot-product inputs in this test.
740 | %s1.elt1 = extractelement <4 x i16> %vec1, i64 2 | ||||

741 | %conv = zext i16 %s1.elt1 to i32 | ||||

742 | %s2.elt1 = extractelement <4 x i16> %vec2, i64 2 | ||||

743 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

744 | %mul1 = mul i32 %conv2, %conv | ||||

745 | | ||||

746 | %s1.elt2 = extractelement <4 x i16> %vec1, i64 3 | ||||

747 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

748 | %s2.elt2 = extractelement <4 x i16> %vec2, i64 3 | ||||

749 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

750 | %mul2 = mul i32 %conv4, %conv3 | ||||

751 | | ||||

; %dst supplies the accumulator value and also receives the final sum.
752 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

753 | %add = add i32 %mul2, %s3 | ||||

754 | %add6 = add i32 %add, %mul1 | ||||

755 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

756 | ret void | ||||

757 | } | ||||

758 | | ||||

; Test notudot2_v4i16_Even: zext multiply-accumulate of two lane pairs taken
; from two <4 x i16> loads, using lanes 0 and 2 of each vector; the sum is
; added to the i32 read from %dst and stored back to %dst.
; Every check prefix below, GCN-DL included, expects two v_mad_u32_u24 and no
; v_dot2_u32_u16. The checks load each source with s_load_dwordx2 and mask the
; two resulting registers separately with 0xffff.
; The check lines between the first and second parameter are FileCheck
; assertions; treat them as test input, not as commentary.
759 | define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, | ||||

760 | ; GFX7-LABEL: notudot2_v4i16_Even: | ||||

761 | ; GFX7: ; %bb.0: ; %entry | ||||

762 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

763 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

764 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

765 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

766 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

767 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

768 | ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 | ||||

769 | ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 | ||||

770 | ; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0 | ||||

771 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

772 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

773 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

774 | ; GFX7-NEXT: s_and_b32 s7, s7, s8 | ||||

775 | ; GFX7-NEXT: v_mov_b32_e32 v0, s5 | ||||

776 | ; GFX7-NEXT: v_mov_b32_e32 v1, s9 | ||||

777 | ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 | ||||

778 | ; GFX7-NEXT: s_and_b32 s6, s6, s8 | ||||

779 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

780 | ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 | ||||

781 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

782 | ; GFX7-NEXT: s_endpgm | ||||

783 | ; | ||||

784 | ; GFX89-LABEL: notudot2_v4i16_Even: | ||||

785 | ; GFX89: ; %bb.0: ; %entry | ||||

786 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

787 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

788 | ; GFX89-NEXT: s_mov_b32 s8, 0xffff | ||||

789 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

790 | ; GFX89-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 | ||||

791 | ; GFX89-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 | ||||

792 | ; GFX89-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

793 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

794 | ; GFX89-NEXT: s_and_b32 s3, s3, s8 | ||||

795 | ; GFX89-NEXT: s_and_b32 s2, s2, s8 | ||||

796 | ; GFX89-NEXT: s_and_b32 s5, s5, s8 | ||||

797 | ; GFX89-NEXT: v_mov_b32_e32 v0, s6 | ||||

798 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

799 | ; GFX89-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

800 | ; GFX89-NEXT: s_and_b32 s4, s4, s8 | ||||

801 | ; GFX89-NEXT: v_mov_b32_e32 v1, s2 | ||||

802 | ; GFX89-NEXT: v_mad_u32_u24 v2, s4, v1, v0 | ||||

803 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

804 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

805 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

806 | ; GFX89-NEXT: s_endpgm | ||||

807 | ; | ||||

808 | ; GCN-DL-LABEL: notudot2_v4i16_Even: | ||||

809 | ; GCN-DL: ; %bb.0: ; %entry | ||||

810 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

811 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

812 | ; GCN-DL-NEXT: s_mov_b32 s8, 0xffff | ||||

813 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

814 | ; GCN-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 | ||||

815 | ; GCN-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 | ||||

816 | ; GCN-DL-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

817 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

818 | ; GCN-DL-NEXT: s_and_b32 s3, s3, s8 | ||||

819 | ; GCN-DL-NEXT: s_and_b32 s2, s2, s8 | ||||

820 | ; GCN-DL-NEXT: s_and_b32 s5, s5, s8 | ||||

821 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s6 | ||||

822 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s3 | ||||

823 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

824 | ; GCN-DL-NEXT: s_and_b32 s4, s4, s8 | ||||

825 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s2 | ||||

826 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 | ||||

827 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

828 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

829 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

830 | ; GCN-DL-NEXT: s_endpgm | ||||

831 | <4 x i16> addrspace(1)* %src2, | ||||

832 | i32 addrspace(1)* nocapture %dst) { | ||||

833 | entry: | ||||

834 | %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 | ||||

835 | %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 | ||||

836 | | ||||

; The two products use lanes 0 and 2, i.e. non-adjacent lanes of each vector.
837 | %s1.elt1 = extractelement <4 x i16> %vec1, i64 0 | ||||

838 | %conv = zext i16 %s1.elt1 to i32 | ||||

839 | %s2.elt1 = extractelement <4 x i16> %vec2, i64 0 | ||||

840 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

841 | %mul1 = mul i32 %conv2, %conv | ||||

842 | | ||||

843 | %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 | ||||

844 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

845 | %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 | ||||

846 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

847 | %mul2 = mul i32 %conv4, %conv3 | ||||

848 | | ||||

; %dst supplies the accumulator value and also receives the final sum.
849 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

850 | %add = add i32 %mul2, %s3 | ||||

851 | %add6 = add i32 %add, %mul1 | ||||

852 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

853 | ret void | ||||

854 | } | ||||

855 | | ||||

; Test notudot2_v4i16_Middle: zext multiply-accumulate of two lane pairs taken
; from two <4 x i16> loads, using lanes 1 and 2 of each vector; the sum is
; added to the i32 read from %dst and stored back to %dst.
; Every check prefix below, GCN-DL included, expects two v_mad_u32_u24 and no
; v_dot2_u32_u16. The checks load each source with s_load_dwordx2, shift one
; register right by 16 and mask the other with 0xffff.
; The check lines between the first and second parameter are FileCheck
; assertions; treat them as test input, not as commentary.
856 | define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, | ||||

857 | ; GFX7-LABEL: notudot2_v4i16_Middle: | ||||

858 | ; GFX7: ; %bb.0: ; %entry | ||||

859 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

860 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

861 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

862 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

863 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

864 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

865 | ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 | ||||

866 | ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 | ||||

867 | ; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0 | ||||

868 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

869 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

870 | ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 | ||||

871 | ; GFX7-NEXT: s_and_b32 s7, s7, s8 | ||||

872 | ; GFX7-NEXT: v_mov_b32_e32 v0, s5 | ||||

873 | ; GFX7-NEXT: v_mov_b32_e32 v1, s9 | ||||

874 | ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 | ||||

875 | ; GFX7-NEXT: s_lshr_b32 s6, s6, 16 | ||||

876 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

877 | ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 | ||||

878 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

879 | ; GFX7-NEXT: s_endpgm | ||||

880 | ; | ||||

881 | ; GFX89-LABEL: notudot2_v4i16_Middle: | ||||

882 | ; GFX89: ; %bb.0: ; %entry | ||||

883 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

884 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

885 | ; GFX89-NEXT: s_mov_b32 s8, 0xffff | ||||

886 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

887 | ; GFX89-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 | ||||

888 | ; GFX89-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 | ||||

889 | ; GFX89-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

890 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

891 | ; GFX89-NEXT: s_and_b32 s3, s3, s8 | ||||

892 | ; GFX89-NEXT: s_lshr_b32 s2, s2, 16 | ||||

893 | ; GFX89-NEXT: s_and_b32 s5, s5, s8 | ||||

894 | ; GFX89-NEXT: v_mov_b32_e32 v0, s6 | ||||

895 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

896 | ; GFX89-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

897 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

898 | ; GFX89-NEXT: v_mov_b32_e32 v1, s2 | ||||

899 | ; GFX89-NEXT: v_mad_u32_u24 v2, s4, v1, v0 | ||||

900 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

901 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

902 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

903 | ; GFX89-NEXT: s_endpgm | ||||

904 | ; | ||||

905 | ; GCN-DL-LABEL: notudot2_v4i16_Middle: | ||||

906 | ; GCN-DL: ; %bb.0: ; %entry | ||||

907 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

908 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

909 | ; GCN-DL-NEXT: s_mov_b32 s8, 0xffff | ||||

910 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

911 | ; GCN-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 | ||||

912 | ; GCN-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 | ||||

913 | ; GCN-DL-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

914 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

915 | ; GCN-DL-NEXT: s_and_b32 s3, s3, s8 | ||||

916 | ; GCN-DL-NEXT: s_lshr_b32 s2, s2, 16 | ||||

917 | ; GCN-DL-NEXT: s_and_b32 s5, s5, s8 | ||||

918 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s6 | ||||

919 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s3 | ||||

920 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

921 | ; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16 | ||||

922 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s2 | ||||

923 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 | ||||

924 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

925 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

926 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

927 | ; GCN-DL-NEXT: s_endpgm | ||||

928 | <4 x i16> addrspace(1)* %src2, | ||||

929 | i32 addrspace(1)* nocapture %dst) { | ||||

930 | entry: | ||||

931 | %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1 | ||||

932 | %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2 | ||||

933 | | ||||

; The two products use lanes 1 and 2, the middle pair of each vector.
934 | %s1.elt1 = extractelement <4 x i16> %vec1, i64 1 | ||||

935 | %conv = zext i16 %s1.elt1 to i32 | ||||

936 | %s2.elt1 = extractelement <4 x i16> %vec2, i64 1 | ||||

937 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

938 | %mul1 = mul i32 %conv2, %conv | ||||

939 | | ||||

940 | %s1.elt2 = extractelement <4 x i16> %vec1, i64 2 | ||||

941 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

942 | %s2.elt2 = extractelement <4 x i16> %vec2, i64 2 | ||||

943 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

944 | %mul2 = mul i32 %conv4, %conv3 | ||||

945 | | ||||

; %dst supplies the accumulator value and also receives the final sum.
946 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

947 | %add = add i32 %mul2, %s3 | ||||

948 | %add6 = add i32 %add, %mul1 | ||||

949 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

950 | ret void | ||||

951 | } | ||||

952 | | ||||

; Test notudot2_DiffIndex: zext multiply-accumulate on two <2 x i16> loads in
; which the lane indices are crossed: %mul1 is vec2[1] * vec1[0] and %mul2 is
; vec2[0] * vec1[1]. Both products are added to the i32 read from %dst and the
; sum is stored back to %dst.
; Every check prefix below, GCN-DL included, expects two v_mad_u32_u24 and no
; v_dot2_u32_u16.
; The check lines between the first and second parameter are FileCheck
; assertions; treat them as test input, not as commentary.
953 | define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, | ||||

954 | ; GFX7-LABEL: notudot2_DiffIndex: | ||||

955 | ; GFX7: ; %bb.0: ; %entry | ||||

956 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

957 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

958 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

959 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

960 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

961 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

962 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

963 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

964 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

965 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

966 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

967 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

968 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

969 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

970 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

971 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

972 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 | ||||

973 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

974 | ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 | ||||

975 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

976 | ; GFX7-NEXT: s_endpgm | ||||

977 | ; | ||||

978 | ; GFX89-LABEL: notudot2_DiffIndex: | ||||

979 | ; GFX89: ; %bb.0: ; %entry | ||||

980 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

981 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

982 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

983 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

984 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

985 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

986 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

987 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

988 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

989 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

990 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

991 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

992 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

993 | ; GFX89-NEXT: v_mad_u32_u24 v0, s2, v1, v0 | ||||

994 | ; GFX89-NEXT: s_lshr_b32 s7, s4, 16 | ||||

995 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

996 | ; GFX89-NEXT: v_mad_u32_u24 v2, s7, v1, v0 | ||||

997 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

998 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

999 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1000 | ; GFX89-NEXT: s_endpgm | ||||

1001 | ; | ||||

1002 | ; GCN-DL-LABEL: notudot2_DiffIndex: | ||||

1003 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1004 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1005 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1006 | ; GCN-DL-NEXT: s_mov_b32 s2, 0xffff | ||||

1007 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1008 | ; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1009 | ; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1010 | ; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1011 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1012 | ; GCN-DL-NEXT: s_and_b32 s6, s3, s2 | ||||

1013 | ; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1014 | ; GCN-DL-NEXT: s_and_b32 s2, s4, s2 | ||||

1015 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s5 | ||||

1016 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s3 | ||||

1017 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 | ||||

1018 | ; GCN-DL-NEXT: s_lshr_b32 s7, s4, 16 | ||||

1019 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s6 | ||||

1020 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 | ||||

1021 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1022 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1023 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1024 | ; GCN-DL-NEXT: s_endpgm | ||||

1025 | <2 x i16> addrspace(1)* %src2, | ||||

1026 | i32 addrspace(1)* nocapture %dst) { | ||||

1027 | entry: | ||||

1028 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1029 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1030 | | ||||

; First product pairs lane 0 of %vec1 with lane 1 of %vec2.
1031 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1032 | %conv = zext i16 %s1.elt1 to i32 | ||||

1033 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 1 | ||||

1034 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

1035 | %mul1 = mul i32 %conv2, %conv | ||||

1036 | | ||||

; Second product pairs lane 1 of %vec1 with lane 0 of %vec2.
1037 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1038 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

1039 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 0 | ||||

1040 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

1041 | %mul2 = mul i32 %conv4, %conv3 | ||||

1042 | | ||||

; %dst supplies the accumulator value and also receives the final sum.
1043 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1044 | %add = add i32 %mul2, %s3 | ||||

1045 | %add6 = add i32 %add, %mul1 | ||||

1046 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

1047 | ret void | ||||

1048 | } | ||||

1049 | | ||||

; Test udot2_MultipleUses_add1: zext multiply-accumulate on lanes 0 and 1 of
; two <2 x i16> loads, where the intermediate sum %add1 (%mul2 + %s3) has two
; users: %add2 and the final %res = %add2 + %add1, which is stored to %dst.
; Every check prefix below, GCN-DL included, expects two v_mad_u32_u24 followed
; by a separate vector add, and no v_dot2_u32_u16.
; NOTE(review): the GFX89 check for the final add names only the destination
; (v_add_u32_e32 v2), unlike the GFX7 and GCN-DL checks which list every
; operand - presumably so one pattern fits both RUN lines that share the GFX89
; prefix; confirm before regenerating the assertions.
; The check lines between the first and second parameter are FileCheck
; assertions; treat them as test input, not as commentary.
1050 | define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1, | ||||

1051 | ; GFX7-LABEL: udot2_MultipleUses_add1: | ||||

1052 | ; GFX7: ; %bb.0: ; %entry | ||||

1053 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1054 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1055 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

1056 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1057 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1058 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1059 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1060 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1061 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

1062 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1063 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

1064 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

1065 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

1066 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

1067 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

1068 | ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 | ||||

1069 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

1070 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

1071 | ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v1, v0 | ||||

1072 | ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ||||

1073 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1074 | ; GFX7-NEXT: s_endpgm | ||||

1075 | ; | ||||

1076 | ; GFX89-LABEL: udot2_MultipleUses_add1: | ||||

1077 | ; GFX89: ; %bb.0: ; %entry | ||||

1078 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1079 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1080 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

1081 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1082 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1083 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1084 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1085 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1086 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

1087 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1088 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

1089 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

1090 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

1091 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

1092 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1093 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

1094 | ; GFX89-NEXT: v_mad_u32_u24 v1, s2, v1, v0 | ||||

1095 | ; GFX89-NEXT: v_add_u32_e32 v2 | ||||

1096 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1097 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1098 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1099 | ; GFX89-NEXT: s_endpgm | ||||

1100 | ; | ||||

1101 | ; GCN-DL-LABEL: udot2_MultipleUses_add1: | ||||

1102 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1103 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1104 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1105 | ; GCN-DL-NEXT: s_mov_b32 s2, 0xffff | ||||

1106 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1107 | ; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1108 | ; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1109 | ; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1110 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1111 | ; GCN-DL-NEXT: s_and_b32 s6, s3, s2 | ||||

1112 | ; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1113 | ; GCN-DL-NEXT: s_and_b32 s2, s4, s2 | ||||

1114 | ; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16 | ||||

1115 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s5 | ||||

1116 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s3 | ||||

1117 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1118 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s6 | ||||

1119 | ; GCN-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 | ||||

1120 | ; GCN-DL-NEXT: v_add_u32_e32 v2, v1, v0 | ||||

1121 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1122 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1123 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1124 | ; GCN-DL-NEXT: s_endpgm | ||||

1125 | <2 x i16> addrspace(1)* %src2, | ||||

1126 | i32 addrspace(1)* nocapture %dst) { | ||||

1127 | entry: | ||||

1128 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1129 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1130 | | ||||

1131 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1132 | %conv = zext i16 %s1.elt1 to i32 | ||||

1133 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

1134 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

1135 | %mul1 = mul i32 %conv2, %conv | ||||

1136 | | ||||

1137 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1138 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

1139 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

1140 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

1141 | %mul2 = mul i32 %conv4, %conv3 | ||||

1142 | | ||||

; %add1 is consumed twice below: once by %add2 and once by %res.
1143 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1144 | %add1 = add i32 %mul2, %s3 | ||||

1145 | %add2 = add i32 %add1, %mul1 | ||||

1146 | | ||||

1147 | %res = add i32 %add2, %add1 | ||||

1148 | store i32 %res, i32 addrspace(1)* %dst, align 4 | ||||

1149 | ret void | ||||

1150 | } | ||||

1151 | | ||||

; Test idot2_MultipleUses_add1: sext (signed) multiply-accumulate on lanes 0
; and 1 of two <2 x i16> loads, where the intermediate sum %add1
; (%mul2 + %s3) has two users: %add2 and the final %res = %add2 + %add1, which
; is stored to %dst.
; Every check prefix below, GCN-DL included, expects two v_mad_i32_i24 followed
; by a separate vector add, and no v_dot2 instruction.
; NOTE(review): the GFX89 check for the final add names only the destination
; (v_add_u32_e32 v2), unlike the GFX7 and GCN-DL checks which list every
; operand - presumably so one pattern fits both RUN lines that share the GFX89
; prefix; confirm before regenerating the assertions.
; The check lines between the first and second parameter are FileCheck
; assertions; treat them as test input, not as commentary.
1152 | define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1, | ||||

1153 | ; GFX7-LABEL: idot2_MultipleUses_add1: | ||||

1154 | ; GFX7: ; %bb.0: ; %entry | ||||

1155 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1156 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1157 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1158 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1159 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1160 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1161 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1162 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

1163 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1164 | ; GFX7-NEXT: s_sext_i32_i16 s7, s4 | ||||

1165 | ; GFX7-NEXT: s_ashr_i32 s4, s4, 16 | ||||

1166 | ; GFX7-NEXT: s_sext_i32_i16 s8, s5 | ||||

1167 | ; GFX7-NEXT: s_ashr_i32 s5, s5, 16 | ||||

1168 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

1169 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

1170 | ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1 | ||||

1171 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

1172 | ; GFX7-NEXT: v_mad_i32_i24 v1, s8, v1, v0 | ||||

1173 | ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 | ||||

1174 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1175 | ; GFX7-NEXT: s_endpgm | ||||

1176 | ; | ||||

1177 | ; GFX89-LABEL: idot2_MultipleUses_add1: | ||||

1178 | ; GFX89: ; %bb.0: ; %entry | ||||

1179 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1180 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1181 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1182 | ; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1183 | ; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1184 | ; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1185 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1186 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1187 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1188 | ; GFX89-NEXT: s_sext_i32_i16 s0, s2 | ||||

1189 | ; GFX89-NEXT: s_ashr_i32 s2, s2, 16 | ||||

1190 | ; GFX89-NEXT: s_sext_i32_i16 s1, s3 | ||||

1191 | ; GFX89-NEXT: s_ashr_i32 s3, s3, 16 | ||||

1192 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

1193 | ; GFX89-NEXT: v_mov_b32_e32 v3, s2 | ||||

1194 | ; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

1195 | ; GFX89-NEXT: v_mov_b32_e32 v3, s0 | ||||

1196 | ; GFX89-NEXT: v_mad_i32_i24 v3, s1, v3, v2 | ||||

1197 | ; GFX89-NEXT: v_add_u32_e32 v2 | ||||

1198 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1199 | ; GFX89-NEXT: s_endpgm | ||||

1200 | ; | ||||

1201 | ; GCN-DL-LABEL: idot2_MultipleUses_add1: | ||||

1202 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1203 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1204 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1205 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1206 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1207 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1208 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1209 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1210 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1211 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1212 | ; GCN-DL-NEXT: s_sext_i32_i16 s0, s2 | ||||

1213 | ; GCN-DL-NEXT: s_ashr_i32 s2, s2, 16 | ||||

1214 | ; GCN-DL-NEXT: s_sext_i32_i16 s1, s3 | ||||

1215 | ; GCN-DL-NEXT: s_ashr_i32 s3, s3, 16 | ||||

1216 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 | ||||

1217 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s2 | ||||

1218 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

1219 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s0 | ||||

1220 | ; GCN-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 | ||||

1221 | ; GCN-DL-NEXT: v_add_u32_e32 v2, v3, v2 | ||||

1222 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1223 | ; GCN-DL-NEXT: s_endpgm | ||||

1224 | <2 x i16> addrspace(1)* %src2, | ||||

1225 | i32 addrspace(1)* nocapture %dst) { | ||||

1226 | entry: | ||||

1227 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1228 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1229 | | ||||

; Lanes are sign-extended here (sext), unlike the zext-based udot2 tests.
1230 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1231 | %conv = sext i16 %s1.elt1 to i32 | ||||

1232 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

1233 | %conv2 = sext i16 %s2.elt1 to i32 | ||||

1234 | %mul1 = mul i32 %conv2, %conv | ||||

1235 | | ||||

1236 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1237 | %conv3 = sext i16 %s1.elt2 to i32 | ||||

1238 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

1239 | %conv4 = sext i16 %s2.elt2 to i32 | ||||

1240 | %mul2 = mul i32 %conv4, %conv3 | ||||

1241 | | ||||

; %add1 is consumed twice below: once by %add2 and once by %res.
1242 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1243 | %add1 = add i32 %mul2, %s3 | ||||

1244 | %add2 = add i32 %add1, %mul1 | ||||

1245 | | ||||

1246 | %res = add i32 %add2, %add1 | ||||

1247 | store i32 %res, i32 addrspace(1)* %dst, align 4 | ||||

1248 | ret void | ||||

1249 | } | ||||

1250 | | ||||

; udot2_MultipleUses_mul1: unsigned (zext) <2 x i16> multiply-accumulate in
; which %mul1, the element-0 product, has two uses: it feeds %add0 and is added
; again in %add2. The value stored to %dst is %mul2 + (%mul1 + %s3) + %mul1,
; where %s3 is the i32 previously loaded from %dst.
; For all three check prefixes the expected code is a chain of three
; v_mad_u32_u24 instructions; no v_dot2 instruction is expected.

1251 | define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1, | ||||

1252 | ; GFX7-LABEL: udot2_MultipleUses_mul1: | ||||

1253 | ; GFX7: ; %bb.0: ; %entry | ||||

1254 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1255 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1256 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

1257 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1258 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1259 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1260 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1261 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1262 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

1263 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1264 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

1265 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

1266 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

1267 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

1268 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

1269 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

1270 | ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v0, v1 | ||||

1271 | ; GFX7-NEXT: v_mov_b32_e32 v2, s7 | ||||

1272 | ; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 | ||||

1273 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 | ||||

1274 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1275 | ; GFX7-NEXT: s_endpgm | ||||

1276 | ; | ||||

1277 | ; GFX89-LABEL: udot2_MultipleUses_mul1: | ||||

1278 | ; GFX89: ; %bb.0: ; %entry | ||||

1279 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1280 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1281 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

1282 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1283 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1284 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1285 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1286 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1287 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

1288 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

1289 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1290 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

1291 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

1292 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

1293 | ; GFX89-NEXT: v_mad_u32_u24 v0, s2, v1, v0 | ||||

1294 | ; GFX89-NEXT: v_mov_b32_e32 v2, s3 | ||||

1295 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v2, v0 | ||||

1296 | ; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

1297 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1298 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1299 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1300 | ; GFX89-NEXT: s_endpgm | ||||

1301 | ; | ||||

1302 | ; GCN-DL-LABEL: udot2_MultipleUses_mul1: | ||||

1303 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1304 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1305 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1306 | ; GCN-DL-NEXT: s_mov_b32 s2, 0xffff | ||||

1307 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1308 | ; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1309 | ; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1310 | ; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1311 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1312 | ; GCN-DL-NEXT: s_and_b32 s6, s3, s2 | ||||

1313 | ; GCN-DL-NEXT: s_and_b32 s2, s4, s2 | ||||

1314 | ; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1315 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s5 | ||||

1316 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s6 | ||||

1317 | ; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16 | ||||

1318 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 | ||||

1319 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s3 | ||||

1320 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 | ||||

1321 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

1322 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1323 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1324 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1325 | ; GCN-DL-NEXT: s_endpgm | ||||

1326 | <2 x i16> addrspace(1)* %src2, | ||||

1327 | i32 addrspace(1)* nocapture %dst) { | ||||

1328 | entry: | ||||

1329 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1330 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1331 | | ||||

1332 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1333 | %conv = zext i16 %s1.elt1 to i32 | ||||

1334 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

1335 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

1336 | %mul1 = mul i32 %conv2, %conv | ||||

1337 | | ||||

1338 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1339 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

1340 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

1341 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

1342 | %mul2 = mul i32 %conv4, %conv3 | ||||

1343 | | ||||

1344 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1345 | %add0 = add i32 %mul1, %s3 | ||||

1346 | | ||||

1347 | %add1 = add i32 %mul2, %add0 | ||||

1348 | %add2 = add i32 %add1, %mul1 | ||||

1349 | | ||||

1350 | store i32 %add2, i32 addrspace(1)* %dst, align 4 | ||||

1351 | ret void | ||||

1352 | } | ||||

1353 | | ||||

; idot2_MultipleUses_mul1: signed (sext) <2 x i16> multiply-accumulate in
; which %mul1, the element-0 product, has two uses: it feeds %add0 and is added
; again in %add2. The value stored to %dst is %mul2 + (%mul1 + %s3) + %mul1,
; where %s3 is the i32 previously loaded from %dst.
; For all three check prefixes the expected code is a chain of three
; v_mad_i32_i24 instructions; no v_dot2 instruction is expected.

1354 | define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1, | ||||

1355 | ; GFX7-LABEL: idot2_MultipleUses_mul1: | ||||

1356 | ; GFX7: ; %bb.0: ; %entry | ||||

1357 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1358 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1359 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1360 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1361 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1362 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1363 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1364 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

1365 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1366 | ; GFX7-NEXT: s_sext_i32_i16 s7, s4 | ||||

1367 | ; GFX7-NEXT: s_sext_i32_i16 s8, s5 | ||||

1368 | ; GFX7-NEXT: s_ashr_i32 s4, s4, 16 | ||||

1369 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

1370 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

1371 | ; GFX7-NEXT: s_ashr_i32 s5, s5, 16 | ||||

1372 | ; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1 | ||||

1373 | ; GFX7-NEXT: v_mov_b32_e32 v2, s4 | ||||

1374 | ; GFX7-NEXT: v_mad_i32_i24 v1, s5, v2, v1 | ||||

1375 | ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1 | ||||

1376 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1377 | ; GFX7-NEXT: s_endpgm | ||||

1378 | ; | ||||

1379 | ; GFX89-LABEL: idot2_MultipleUses_mul1: | ||||

1380 | ; GFX89: ; %bb.0: ; %entry | ||||

1381 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1382 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1383 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1384 | ; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1385 | ; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1386 | ; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1387 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1388 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1389 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1390 | ; GFX89-NEXT: s_sext_i32_i16 s0, s2 | ||||

1391 | ; GFX89-NEXT: s_sext_i32_i16 s1, s3 | ||||

1392 | ; GFX89-NEXT: s_ashr_i32 s2, s2, 16 | ||||

1393 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

1394 | ; GFX89-NEXT: v_mov_b32_e32 v3, s0 | ||||

1395 | ; GFX89-NEXT: s_ashr_i32 s3, s3, 16 | ||||

1396 | ; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

1397 | ; GFX89-NEXT: v_mov_b32_e32 v4, s2 | ||||

1398 | ; GFX89-NEXT: v_mad_i32_i24 v2, s3, v4, v2 | ||||

1399 | ; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

1400 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1401 | ; GFX89-NEXT: s_endpgm | ||||

1402 | ; | ||||

1403 | ; GCN-DL-LABEL: idot2_MultipleUses_mul1: | ||||

1404 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1405 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1406 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1407 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1408 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1409 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1410 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1411 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1412 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1413 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1414 | ; GCN-DL-NEXT: s_sext_i32_i16 s0, s2 | ||||

1415 | ; GCN-DL-NEXT: s_sext_i32_i16 s1, s3 | ||||

1416 | ; GCN-DL-NEXT: s_ashr_i32 s2, s2, 16 | ||||

1417 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 | ||||

1418 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s0 | ||||

1419 | ; GCN-DL-NEXT: s_ashr_i32 s3, s3, 16 | ||||

1420 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

1421 | ; GCN-DL-NEXT: v_mov_b32_e32 v4, s2 | ||||

1422 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 | ||||

1423 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

1424 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1425 | ; GCN-DL-NEXT: s_endpgm | ||||

1426 | <2 x i16> addrspace(1)* %src2, | ||||

1427 | i32 addrspace(1)* nocapture %dst) { | ||||

1428 | entry: | ||||

1429 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1430 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1431 | | ||||

1432 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1433 | %conv = sext i16 %s1.elt1 to i32 | ||||

1434 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

1435 | %conv2 = sext i16 %s2.elt1 to i32 | ||||

1436 | %mul1 = mul i32 %conv2, %conv | ||||

1437 | | ||||

1438 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1439 | %conv3 = sext i16 %s1.elt2 to i32 | ||||

1440 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

1441 | %conv4 = sext i16 %s2.elt2 to i32 | ||||

1442 | %mul2 = mul i32 %conv4, %conv3 | ||||

1443 | | ||||

1444 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1445 | %add0 = add i32 %mul1, %s3 | ||||

1446 | | ||||

1447 | %add1 = add i32 %mul2, %add0 | ||||

1448 | %add2 = add i32 %add1, %mul1 | ||||

1449 | | ||||

1450 | store i32 %add2, i32 addrspace(1)* %dst, align 4 | ||||

1451 | ret void | ||||

1452 | } | ||||

1453 | | ||||

; udot2_MultipleUses_mul2: unsigned (zext) <2 x i16> multiply-accumulate in
; which %mul2, the element-1 product, has two uses: it feeds both %add0 and
; %add1. The value stored to %dst is (%mul2 + (%mul2 + %s3)) + %mul1, where
; %s3 is the i32 previously loaded from %dst.
; For all three check prefixes the expected code is a chain of three
; v_mad_u32_u24 instructions, the first two of which multiply the same pair of
; operands; no v_dot2 instruction is expected.

1454 | define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1, | ||||

1455 | ; GFX7-LABEL: udot2_MultipleUses_mul2: | ||||

1456 | ; GFX7: ; %bb.0: ; %entry | ||||

1457 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1458 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1459 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

1460 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1461 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1462 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1463 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1464 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1465 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

1466 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1467 | ; GFX7-NEXT: s_lshr_b32 s7, s4, 16 | ||||

1468 | ; GFX7-NEXT: s_lshr_b32 s9, s5, 16 | ||||

1469 | ; GFX7-NEXT: v_mov_b32_e32 v0, s7 | ||||

1470 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

1471 | ; GFX7-NEXT: v_mad_u32_u24 v1, s9, v0, v1 | ||||

1472 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

1473 | ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 | ||||

1474 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

1475 | ; GFX7-NEXT: v_mov_b32_e32 v1, s4 | ||||

1476 | ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 | ||||

1477 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1478 | ; GFX7-NEXT: s_endpgm | ||||

1479 | ; | ||||

1480 | ; GFX89-LABEL: udot2_MultipleUses_mul2: | ||||

1481 | ; GFX89: ; %bb.0: ; %entry | ||||

1482 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1483 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1484 | ; GFX89-NEXT: s_mov_b32 s2, 0xffff | ||||

1485 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1486 | ; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1487 | ; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1488 | ; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1489 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1490 | ; GFX89-NEXT: s_and_b32 s6, s3, s2 | ||||

1491 | ; GFX89-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1492 | ; GFX89-NEXT: s_and_b32 s2, s4, s2 | ||||

1493 | ; GFX89-NEXT: s_lshr_b32 s4, s4, 16 | ||||

1494 | ; GFX89-NEXT: v_mov_b32_e32 v0, s5 | ||||

1495 | ; GFX89-NEXT: v_mov_b32_e32 v1, s3 | ||||

1496 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1497 | ; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1498 | ; GFX89-NEXT: v_mov_b32_e32 v1, s6 | ||||

1499 | ; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

1500 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1501 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1502 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1503 | ; GFX89-NEXT: s_endpgm | ||||

1504 | ; | ||||

1505 | ; GCN-DL-LABEL: udot2_MultipleUses_mul2: | ||||

1506 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1507 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1508 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1509 | ; GCN-DL-NEXT: s_mov_b32 s2, 0xffff | ||||

1510 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1511 | ; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0 | ||||

1512 | ; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 | ||||

1513 | ; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 | ||||

1514 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1515 | ; GCN-DL-NEXT: s_and_b32 s6, s3, s2 | ||||

1516 | ; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16 | ||||

1517 | ; GCN-DL-NEXT: s_and_b32 s2, s4, s2 | ||||

1518 | ; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16 | ||||

1519 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s5 | ||||

1520 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s3 | ||||

1521 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1522 | ; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1523 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s6 | ||||

1524 | ; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 | ||||

1525 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1526 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1527 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1528 | ; GCN-DL-NEXT: s_endpgm | ||||

1529 | <2 x i16> addrspace(1)* %src2, | ||||

1530 | i32 addrspace(1)* nocapture %dst) { | ||||

1531 | entry: | ||||

1532 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1533 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1534 | | ||||

1535 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1536 | %conv = zext i16 %s1.elt1 to i32 | ||||

1537 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

1538 | %conv2 = zext i16 %s2.elt1 to i32 | ||||

1539 | %mul1 = mul i32 %conv2, %conv | ||||

1540 | | ||||

1541 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1542 | %conv3 = zext i16 %s1.elt2 to i32 | ||||

1543 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

1544 | %conv4 = zext i16 %s2.elt2 to i32 | ||||

1545 | %mul2 = mul i32 %conv4, %conv3 | ||||

1546 | | ||||

1547 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1548 | %add0 = add i32 %mul2, %s3 | ||||

1549 | | ||||

1550 | %add1 = add i32 %mul2, %add0 | ||||

1551 | %add2 = add i32 %add1, %mul1 | ||||

1552 | | ||||

1553 | store i32 %add2, i32 addrspace(1)* %dst, align 4 | ||||

1554 | ret void | ||||

1555 | } | ||||

1556 | | ||||

; idot2_MultipleUses_mul2: signed (sext) <2 x i16> multiply-accumulate in
; which %mul2, the element-1 product, has two uses: it feeds both %add0 and
; %add1. The value stored to %dst is (%mul2 + (%mul2 + %s3)) + %mul1, where
; %s3 is the i32 previously loaded from %dst.
; For all three check prefixes the expected code is a chain of three
; v_mad_i32_i24 instructions, the first two of which multiply the same pair of
; operands; no v_dot2 instruction is expected.

1557 | define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1, | ||||

1558 | ; GFX7-LABEL: idot2_MultipleUses_mul2: | ||||

1559 | ; GFX7: ; %bb.0: ; %entry | ||||

1560 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1561 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1562 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1563 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1564 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1565 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1566 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1567 | ; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 | ||||

1568 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1569 | ; GFX7-NEXT: s_sext_i32_i16 s7, s4 | ||||

1570 | ; GFX7-NEXT: s_ashr_i32 s4, s4, 16 | ||||

1571 | ; GFX7-NEXT: s_sext_i32_i16 s8, s5 | ||||

1572 | ; GFX7-NEXT: s_ashr_i32 s5, s5, 16 | ||||

1573 | ; GFX7-NEXT: v_mov_b32_e32 v0, s4 | ||||

1574 | ; GFX7-NEXT: v_mov_b32_e32 v1, s6 | ||||

1575 | ; GFX7-NEXT: v_mad_i32_i24 v1, s5, v0, v1 | ||||

1576 | ; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1 | ||||

1577 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

1578 | ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 | ||||

1579 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1580 | ; GFX7-NEXT: s_endpgm | ||||

1581 | ; | ||||

1582 | ; GFX89-LABEL: idot2_MultipleUses_mul2: | ||||

1583 | ; GFX89: ; %bb.0: ; %entry | ||||

1584 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1585 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1586 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1587 | ; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1588 | ; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1589 | ; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1590 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1591 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1592 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1593 | ; GFX89-NEXT: s_sext_i32_i16 s0, s2 | ||||

1594 | ; GFX89-NEXT: s_ashr_i32 s2, s2, 16 | ||||

1595 | ; GFX89-NEXT: s_sext_i32_i16 s1, s3 | ||||

1596 | ; GFX89-NEXT: s_ashr_i32 s3, s3, 16 | ||||

1597 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

1598 | ; GFX89-NEXT: v_mov_b32_e32 v3, s2 | ||||

1599 | ; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

1600 | ; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

1601 | ; GFX89-NEXT: v_mov_b32_e32 v3, s0 | ||||

1602 | ; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

1603 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1604 | ; GFX89-NEXT: s_endpgm | ||||

1605 | ; | ||||

1606 | ; GCN-DL-LABEL: idot2_MultipleUses_mul2: | ||||

1607 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1608 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1609 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1610 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1611 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1612 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1613 | ; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1614 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1615 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1616 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1617 | ; GCN-DL-NEXT: s_sext_i32_i16 s0, s2 | ||||

1618 | ; GCN-DL-NEXT: s_ashr_i32 s2, s2, 16 | ||||

1619 | ; GCN-DL-NEXT: s_sext_i32_i16 s1, s3 | ||||

1620 | ; GCN-DL-NEXT: s_ashr_i32 s3, s3, 16 | ||||

1621 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 | ||||

1622 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s2 | ||||

1623 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

1624 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 | ||||

1625 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s0 | ||||

1626 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 | ||||

1627 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1628 | ; GCN-DL-NEXT: s_endpgm | ||||

1629 | <2 x i16> addrspace(1)* %src2, | ||||

1630 | i32 addrspace(1)* nocapture %dst) { | ||||

1631 | entry: | ||||

1632 | %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1633 | %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1634 | | ||||

1635 | %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 | ||||

1636 | %conv = sext i16 %s1.elt1 to i32 | ||||

1637 | %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 | ||||

1638 | %conv2 = sext i16 %s2.elt1 to i32 | ||||

1639 | %mul1 = mul i32 %conv2, %conv | ||||

1640 | | ||||

1641 | %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 | ||||

1642 | %conv3 = sext i16 %s1.elt2 to i32 | ||||

1643 | %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 | ||||

1644 | %conv4 = sext i16 %s2.elt2 to i32 | ||||

1645 | %mul2 = mul i32 %conv4, %conv3 | ||||

1646 | | ||||

1647 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1648 | %add0 = add i32 %mul2, %s3 | ||||

1649 | | ||||

1650 | %add1 = add i32 %mul2, %add0 | ||||

1651 | %add2 = add i32 %add1, %mul1 | ||||

1652 | | ||||

1653 | store i32 %add2, i32 addrspace(1)* %dst, align 4 | ||||

1654 | ret void | ||||

1655 | } | ||||

1656 | | ||||

; udot2_acc16: 2-element i16 dot product with a 16-bit accumulator. Both
; products (%mul1, %mul2) are computed directly in i16 with no zext/sext, the
; accumulator %s2 is loaded from %dst as an i16 (align 2), and the i16 sum
; %mul2 + %s2 + %mul1 is stored back to %dst.
; Expected code: the GFX7 and GFX89 checks use two v_mad_u32_u24 followed by a
; 16-bit store; the GCN-DL checks use a single v_dot2_u32_u16 followed by
; global_store_short.

1657 | define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, | ||||

1658 | ; GFX7-LABEL: udot2_acc16: | ||||

1659 | ; GFX7: ; %bb.0: ; %entry | ||||

1660 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1661 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1662 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1663 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1664 | ; GFX7-NEXT: s_mov_b32 s8, 0xffff | ||||

1665 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1666 | ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 | ||||

1667 | ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 | ||||

1668 | ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 | ||||

1669 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1670 | ; GFX7-NEXT: s_lshr_b32 s6, s4, 16 | ||||

1671 | ; GFX7-NEXT: s_and_b32 s4, s4, s8 | ||||

1672 | ; GFX7-NEXT: s_lshr_b32 s7, s5, 16 | ||||

1673 | ; GFX7-NEXT: v_mov_b32_e32 v1, s7 | ||||

1674 | ; GFX7-NEXT: s_and_b32 s5, s5, s8 | ||||

1675 | ; GFX7-NEXT: s_waitcnt vmcnt(0) | ||||

1676 | ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 | ||||

1677 | ; GFX7-NEXT: v_mov_b32_e32 v1, s5 | ||||

1678 | ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 | ||||

1679 | ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 | ||||

1680 | ; GFX7-NEXT: s_endpgm | ||||

1681 | ; | ||||

1682 | ; GFX89-LABEL: udot2_acc16: | ||||

1683 | ; GFX89: ; %bb.0: ; %entry | ||||

1684 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1685 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1686 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1687 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1688 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1689 | ; GFX89-NEXT: {{flat|global}}_load_ushort v2, v[0:1] | ||||

1690 | ; GFX89-NEXT: s_load_dword s1, s[4:5], 0x0 | ||||

1691 | ; GFX89-NEXT: s_load_dword s2, s[6:7], 0x0 | ||||

1692 | ; GFX89-NEXT: s_mov_b32 s0, 0xffff | ||||

1693 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1694 | ; GFX89-NEXT: s_and_b32 s3, s1, s0 | ||||

1695 | ; GFX89-NEXT: s_and_b32 s0, s2, s0 | ||||

1696 | ; GFX89-NEXT: s_lshr_b32 s2, s2, 16 | ||||

1697 | ; GFX89-NEXT: s_lshr_b32 s1, s1, 16 | ||||

1698 | ; GFX89-NEXT: v_mov_b32_e32 v3, s2 | ||||

1699 | ; GFX89-NEXT: s_waitcnt vmcnt(0) | ||||

1700 | ; GFX89-NEXT: v_mad_u32_u24 v2, s1, v3, v2 | ||||

1701 | ; GFX89-NEXT: v_mov_b32_e32 v3, s0 | ||||

1702 | ; GFX89-NEXT: v_mad_u32_u24 v2, s3, v3, v2 | ||||

1703 | ; GFX89-NEXT: {{flat|global}}_store_short v[0:1], v2 | ||||

1704 | ; GFX89-NEXT: s_endpgm | ||||

1705 | ; | ||||

1706 | ; GCN-DL-LABEL: udot2_acc16: | ||||

1707 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1708 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1709 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1710 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1711 | ; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 | ||||

1712 | ; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0 | ||||

1713 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1714 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1715 | ; GCN-DL-NEXT: global_load_ushort v2, v[0:1], off | ||||

1716 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1717 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s3 | ||||

1718 | ; GCN-DL-NEXT: s_waitcnt vmcnt(0) | ||||

1719 | ; GCN-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 | ||||

1720 | ; GCN-DL-NEXT: global_store_short v[0:1], v2, off | ||||

1721 | ; GCN-DL-NEXT: s_endpgm | ||||

1722 | <2 x i16> addrspace(1)* %src2, | ||||

1723 | i16 addrspace(1)* nocapture %dst) { | ||||

1724 | entry: | ||||

1725 | %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1 | ||||

1726 | %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2 | ||||

1727 | | ||||

1728 | %v1e1 = extractelement <2 x i16> %v1, i64 0 | ||||

1729 | %v2e1 = extractelement <2 x i16> %v2, i64 0 | ||||

1730 | %mul1 = mul i16 %v1e1, %v2e1 | ||||

1731 | | ||||

1732 | %v1e2 = extractelement <2 x i16> %v1, i64 1 | ||||

1733 | %v2e2 = extractelement <2 x i16> %v2, i64 1 | ||||

1734 | %mul2 = mul i16 %v1e2, %v2e2 | ||||

1735 | | ||||

1736 | %s2 = load i16, i16 addrspace(1)* %dst, align 2 | ||||

1737 | %add1 = add i16 %mul2, %s2 | ||||

1738 | %add2 = add i16 %add1, %mul1 | ||||

1739 | store i16 %add2, i16 addrspace(1)* %dst, align 2 | ||||

1740 | ret void | ||||

1741 | } | ||||

1742 | | ||||

1743 | | ||||

; notsdot2_sext8: 2-element multiply-accumulate whose operands are <2 x i8>
; vectors, each element sign-extended to i32 before a `mul nuw`. The i32 sum
; %mul2 + %s3 + %mul1 is stored to %dst, where %s3 is the i32 previously loaded
; from %dst.
; For all three check prefixes the expected code is two v_mad_i32_i24
; instructions; no v_dot2 instruction is expected.
; NOTE(review): the two GFX89 v_mad_i32_i24 checks are plain matches rather
; than NEXT-suffixed ones, unlike every other line in this block. Presumably a
; hand edit to tolerate differing output between the targets that share this
; prefix -- confirm before regenerating the assertions with the update script.

1744 | define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, | ||||

1745 | ; GFX7-LABEL: notsdot2_sext8: | ||||

1746 | ; GFX7: ; %bb.0: ; %entry | ||||

1747 | ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | ||||

1748 | ; GFX7-NEXT: s_mov_b32 s3, 0xf000 | ||||

1749 | ; GFX7-NEXT: s_mov_b32 s2, -1 | ||||

1750 | ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd | ||||

1751 | ; GFX7-NEXT: s_mov_b32 s10, s2 | ||||

1752 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1753 | ; GFX7-NEXT: s_mov_b32 s8, s6 | ||||

1754 | ; GFX7-NEXT: s_mov_b32 s9, s7 | ||||

1755 | ; GFX7-NEXT: s_mov_b32 s11, s3 | ||||

1756 | ; GFX7-NEXT: s_mov_b32 s6, s2 | ||||

1757 | ; GFX7-NEXT: s_mov_b32 s7, s3 | ||||

1758 | ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 | ||||

1759 | ; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0 | ||||

1760 | ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 | ||||

1761 | ; GFX7-NEXT: s_waitcnt vmcnt(1) | ||||

1762 | ; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8 | ||||

1763 | ; GFX7-NEXT: s_waitcnt vmcnt(0) | ||||

1764 | ; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8 | ||||

1765 | ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 | ||||

1766 | ; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8 | ||||

1767 | ; GFX7-NEXT: s_waitcnt lgkmcnt(0) | ||||

1768 | ; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s4 | ||||

1769 | ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0 | ||||

1770 | ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 | ||||

1771 | ; GFX7-NEXT: s_endpgm | ||||

1772 | ; | ||||

1773 | ; GFX89-LABEL: notsdot2_sext8: | ||||

1774 | ; GFX89: ; %bb.0: ; %entry | ||||

1775 | ; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1776 | ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1777 | ; GFX89-NEXT: s_waitcnt lgkmcnt(0) | ||||

1778 | ; GFX89-NEXT: s_load_dword s2, s[0:1], 0x0 | ||||

1779 | ; GFX89-NEXT: v_mov_b32_e32 v0, s6 | ||||

1780 | ; GFX89-NEXT: v_mov_b32_e32 v1, s7 | ||||

1781 | ; GFX89-NEXT: v_mov_b32_e32 v2, s4 | ||||

1782 | ; GFX89-NEXT: v_mov_b32_e32 v3, s5 | ||||

1783 | ; GFX89-NEXT: {{flat|global}}_load_ushort v2, v[2:3] | ||||

1784 | ; GFX89-NEXT: {{flat|global}}_load_ushort v3, v[0:1] | ||||

1785 | ; GFX89-NEXT: v_mov_b32_e32 v0, s0 | ||||

1786 | ; GFX89-NEXT: v_mov_b32_e32 v1, s1 | ||||

1787 | ; GFX89-NEXT: s_waitcnt vmcnt(1) | ||||

1788 | ; GFX89-NEXT: v_lshrrev_b16_e32 v4, 8, v2 | ||||

1789 | ; GFX89-NEXT: s_waitcnt vmcnt(0) | ||||

1790 | ; GFX89-NEXT: v_bfe_i32 v5, v3, 0, 8 | ||||

1791 | ; GFX89-NEXT: v_lshrrev_b16_e32 v3, 8, v3 | ||||

1792 | ; GFX89-NEXT: v_bfe_i32 v4, v4, 0, 8 | ||||

1793 | ; GFX89-NEXT: v_bfe_i32 v3, v3, 0, 8 | ||||

1794 | ; GFX89-NEXT: v_bfe_i32 v2, v2, 0, 8 | ||||

1795 | ; GFX89: v_mad_i32_i24 v3, v3, v4, s2 | ||||

1796 | ; GFX89: v_mad_i32_i24 v2, v5, v2, v3 | ||||

1797 | ; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2 | ||||

1798 | ; GFX89-NEXT: s_endpgm | ||||

1799 | ; | ||||

1800 | ; GCN-DL-LABEL: notsdot2_sext8: | ||||

1801 | ; GCN-DL: ; %bb.0: ; %entry | ||||

1802 | ; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | ||||

1803 | ; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 | ||||

1804 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1805 | ; GCN-DL-NEXT: s_load_dword s2, s[0:1], 0x0 | ||||

1806 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s6 | ||||

1807 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s7 | ||||

1808 | ; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 | ||||

1809 | ; GCN-DL-NEXT: v_mov_b32_e32 v3, s5 | ||||

1810 | ; GCN-DL-NEXT: global_load_ushort v2, v[2:3], off | ||||

1811 | ; GCN-DL-NEXT: global_load_ushort v3, v[0:1], off | ||||

1812 | ; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 | ||||

1813 | ; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 | ||||

1814 | ; GCN-DL-NEXT: s_waitcnt vmcnt(1) | ||||

1815 | ; GCN-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 | ||||

1816 | ; GCN-DL-NEXT: s_waitcnt vmcnt(0) | ||||

1817 | ; GCN-DL-NEXT: v_bfe_i32 v5, v3, 0, 8 | ||||

1818 | ; GCN-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 | ||||

1819 | ; GCN-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 | ||||

1820 | ; GCN-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 | ||||

1821 | ; GCN-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 | ||||

1822 | ; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) | ||||

1823 | ; GCN-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 | ||||

1824 | ; GCN-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 | ||||

1825 | ; GCN-DL-NEXT: global_store_dword v[0:1], v2, off | ||||

1826 | ; GCN-DL-NEXT: s_endpgm | ||||

1827 | <2 x i8> addrspace(1)* %src2, | ||||

1828 | i32 addrspace(1)* nocapture %dst) { | ||||

1829 | entry: | ||||

1830 | %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1 | ||||

1831 | %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2 | ||||

1832 | | ||||

1833 | %s1.elt1 = extractelement <2 x i8> %vec1, i64 0 | ||||

1834 | %conv = sext i8 %s1.elt1 to i32 | ||||

1835 | %s2.elt1 = extractelement <2 x i8> %vec2, i64 0 | ||||

1836 | %conv2 = sext i8 %s2.elt1 to i32 | ||||

1837 | %mul1 = mul nuw i32 %conv2, %conv | ||||

1838 | | ||||

1839 | %s1.elt2 = extractelement <2 x i8> %vec1, i64 1 | ||||

1840 | %conv3 = sext i8 %s1.elt2 to i32 | ||||

1841 | %s2.elt2 = extractelement <2 x i8> %vec2, i64 1 | ||||

1842 | %conv4 = sext i8 %s2.elt2 to i32 | ||||

1843 | %mul2 = mul nuw i32 %conv4, %conv3 | ||||

1844 | | ||||

1845 | %s3 = load i32, i32 addrspace(1)* %dst, align 4 | ||||

1846 | %add = add i32 %mul2, %s3 | ||||

1847 | %add6 = add i32 %add, %mul1 | ||||

1848 | store i32 %add6, i32 addrspace(1)* %dst, align 4 | ||||

1849 | ret void | ||||

1850 | } |