Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -1043,6 +1043,18 @@
   return CastClass_match<OpTy, Instruction::SIToFP>(Op);
 }
 
+/// Matches an FPToSI cast whose operand is matched by \p Op.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::FPToSI> m_FPToSI(const OpTy &Op) {
+  return CastClass_match<OpTy, Instruction::FPToSI>(Op);
+}
+
+/// Matches an FPToUI cast whose operand is matched by \p Op.
+template <typename OpTy>
+inline CastClass_match<OpTy, Instruction::FPToUI> m_FPToUI(const OpTy &Op) {
+  return CastClass_match<OpTy, Instruction::FPToUI>(Op);
+}
+
 /// Matches FPTrunc
 template <typename OpTy>
 inline CastClass_match<OpTy, Instruction::FPTrunc> m_FPTrunc(const OpTy &Op) {
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1728,10 +1728,43 @@
 }
 
+/// Return true if the function containing \p I is marked
+/// "no-signed-zeros-fp-math"="true", meaning a transform may change the sign
+/// of a zero result.
+static bool canIgnoreSignedZeros(const Instruction &I) {
+  const Function *F = I.getFunction();
+  return F->getFnAttribute("no-signed-zeros-fp-math").getValueAsString() ==
+         "true";
+}
+
 Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+  // fptoui rounds towards zero, so a round trip through an integer type is
+  // libm 'trunc' except for the sign of zero: for X in (-1.0, -0.0] the casts
+  // produce +0.0 but trunc produces -0.0. Only fold when the function allows
+  // signed zeros to be ignored:
+  // uitofp (fptoui X) --> llvm.trunc(X)
+  Value *X;
+  if (match(CI.getOperand(0), m_FPToUI(m_Value(X))) &&
+      X->getType() == CI.getType() && canIgnoreSignedZeros(CI)) {
+    Value *Trunc = Builder.CreateIntrinsic(Intrinsic::trunc, { X }, &CI);
+    return replaceInstUsesWith(CI, Trunc);
+  }
+
   return commonCastTransforms(CI);
 }
 
 Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+  // fptosi rounds towards zero, so a round trip through an integer type is
+  // libm 'trunc' except for the sign of zero: for X in (-1.0, -0.0] the casts
+  // produce +0.0 but trunc produces -0.0. Only fold when the function allows
+  // signed zeros to be ignored:
+  // sitofp (fptosi X) --> llvm.trunc(X)
+  Value *X;
+  if (match(CI.getOperand(0), m_FPToSI(m_Value(X))) &&
+      X->getType() == CI.getType() && canIgnoreSignedZeros(CI)) {
+    Value *Trunc = Builder.CreateIntrinsic(Intrinsic::trunc, { X }, &CI);
+    return replaceInstUsesWith(CI, Trunc);
+  }
+
   return commonCastTransforms(CI);
 }
 
Index: test/Transforms/InstCombine/sitofp.ll
===================================================================
--- test/Transforms/InstCombine/sitofp.ll
+++ test/Transforms/InstCombine/sitofp.ll
@@ -216,3 +216,98 @@
   ret i55 %C
 }
 
+; Casting to integer and back to the same type with rounding towards zero is llvm.trunc(),
+; provided that signed zeros can be ignored (see the negative tests at the end).
+; PR36617: https://bugs.llvm.org/show_bug.cgi?id=36617
+
+define float @trunc_signed_f32(float %x) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: @trunc_signed_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.trunc.f32(float [[X:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %i = fptosi float %x to i32
+  %r = sitofp i32 %i to float
+  ret float %r
+}
+
+; The intermediate type does not matter. If the first cast is out-of-range, that's UB.
+
+define double @trunc_signed_f64(double %x) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: @trunc_signed_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.trunc.f64(double [[X:%.*]])
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %i = fptosi double %x to i8
+  %r = sitofp i8 %i to double
+  ret double %r
+}
+
+; Vector types work too.
+
+define <2 x half> @trunc_signed_v2f16(<2 x half> %x) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: @trunc_signed_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.trunc.v2f16(<2 x half> [[X:%.*]])
+; CHECK-NEXT:    ret <2 x half> [[TMP1]]
+;
+  %i = fptosi <2 x half> %x to <2 x i32>
+  %r = sitofp <2 x i32> %i to <2 x half>
+  ret <2 x half> %r
+}
+
+; Casting to unsigned integer and back to the same type with rounding towards zero is still llvm.trunc().
+
+define <2 x float> @trunc_unsigned_v2f32(<2 x float> %x) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: @trunc_unsigned_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[X:%.*]])
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
+;
+  %i = fptoui <2 x float> %x to <2 x i232>
+  %r = uitofp <2 x i232> %i to <2 x float>
+  ret <2 x float> %r
+}
+
+define fp128 @trunc_unsigned_f128(fp128 %x) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: @trunc_unsigned_f128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fp128 @llvm.trunc.f128(fp128 [[X:%.*]])
+; CHECK-NEXT:    ret fp128 [[TMP1]]
+;
+  %i = fptoui fp128 %x to i128
+  %r = uitofp i128 %i to fp128
+  ret fp128 %r
+}
+
+define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) "no-signed-zeros-fp-math"="true" {
+; CHECK-LABEL: @trunc_unsigned_v2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[X:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %i = fptoui <2 x double> %x to <2 x i16>
+  %r = uitofp <2 x i16> %i to <2 x double>
+  ret <2 x double> %r
+}
+
+; Negative tests: without "no-signed-zeros-fp-math" the casts must be preserved because
+; they produce +0.0 for inputs in (-1.0, -0.0] where llvm.trunc() produces -0.0.
+
+define float @no_trunc_signed_f32_signed_zeros(float %x) {
+; CHECK-LABEL: @no_trunc_signed_f32_signed_zeros(
+; CHECK-NEXT:    [[I:%.*]] = fptosi float [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[I]] to float
+; CHECK-NEXT:    ret float [[R]]
+;
+  %i = fptosi float %x to i32
+  %r = sitofp i32 %i to float
+  ret float %r
+}
+
+define double @no_trunc_unsigned_f64_signed_zeros(double %x) {
+; CHECK-LABEL: @no_trunc_unsigned_f64_signed_zeros(
+; CHECK-NEXT:    [[I:%.*]] = fptoui double [[X:%.*]] to i64
+; CHECK-NEXT:    [[R:%.*]] = uitofp i64 [[I]] to double
+; CHECK-NEXT:    ret double [[R]]
+;
+  %i = fptoui double %x to i64
+  %r = uitofp i64 %i to double
+  ret double %r
+}
+