DLTcollab · howjmay · Dec 27, 2020
@@ -3630,6 +3630,21 @@ FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
 #endif
 }
 
+// Multiply-add on packed double-precision (64-bit) floating-point elements:
+// each lane of dst receives a * b + c.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pd
+FORCE_INLINE __m128d _mm_fmadd_pd(__m128d a, __m128d b, __m128d c)
+{
+#if defined(__aarch64__)
+    // vfmaq_f64 takes the addend first, followed by the two factors.
+    float64x2_t addend = vreinterpretq_f64_m128d(c);
+    float64x2_t rhs = vreinterpretq_f64_m128d(b);
+    float64x2_t lhs = vreinterpretq_f64_m128d(a);
+    return vreinterpretq_m128d_f64(vfmaq_f64(addend, rhs, lhs));
+#else
+    // No AArch64 intrinsic here: compose a separate multiply and add.
+    __m128d product = _mm_mul_pd(a, b);
+    return _mm_add_pd(product, c);
+#endif
+}
+
+
 // Alternatively add and subtract packed single-precision (32-bit)
 // floating-point elements in a to/from packed elements in b, and store the
 // results in dst.

@@ -345,4 +345,14 @@ result_t validateDouble(__m128d a, double d0, double d1)
     return TEST_SUCCESS;
 }
 
+// Check that both lanes of a lie within an absolute tolerance of the expected
+// values: lane 0 against d0 and lane 1 against d1. Each comparison is the
+// strict test |lane - expected| < epsilon, evaluated through ASSERT_RETURN;
+// TEST_SUCCESS is returned once both comparisons have passed.
+// The expected-value parameters are named d0/d1 to agree with this function's
+// declaration and with validateDouble().
+result_t validateDoubleEpsilon(__m128d a, double d0, double d1, double epsilon)
+{
+    // View the vector as two consecutive doubles.
+    const double *t = (const double *) &a;
+    double df0 = fabs(t[0] - d0);
+    double df1 = fabs(t[1] - d1);
+    ASSERT_RETURN(df0 < epsilon);
+    ASSERT_RETURN(df1 < epsilon);
+    return TEST_SUCCESS;
+}
+
 }  // namespace SSE2NEON
@@ -181,6 +181,7 @@ result_t validateFloatError(__m128 a,
                             float f3,
                             float err);
 result_t validateDouble(__m128d a, double d0, double d1);
+result_t validateDoubleEpsilon(__m128d a, double d0, double d1, double epsilon);
 }  // namespace SSE2NEON
 
 #endif
@@ -7404,6 +7404,26 @@ result_t test_mm_fmadd_ps(const SSE2NEONTestImpl &impl, uint32_t i)
     return TEST_UNIMPL;
 }
 
+// Exercise _mm_fmadd_pd: build a, b and c from the first six test floats
+// (widened to double), compute a * b + c per lane in scalar code, and require
+// the intrinsic's result to match within an absolute tolerance of 1e-4.
+result_t test_mm_fmadd_pd(const SSE2NEONTestImpl &impl, uint32_t i)
+{
+    // _mm_load_pd requires a 16-byte aligned address, which a plain array of
+    // double does not guarantee, so request that alignment explicitly.
+    alignas(16) const double _a[2]{(double) impl.mTestFloats[0],
+                                   (double) impl.mTestFloats[1]};
+    alignas(16) const double _b[2]{(double) impl.mTestFloats[2],
+                                   (double) impl.mTestFloats[3]};
+    alignas(16) const double _c[2]{(double) impl.mTestFloats[4],
+                                   (double) impl.mTestFloats[5]};
+
+    __m128d a = _mm_load_pd(_a);
+    __m128d b = _mm_load_pd(_b);
+    __m128d c = _mm_load_pd(_c);
+
+    // Scalar reference computed from the same inputs.
+    double f0 = _a[0] * _b[0] + _c[0];
+    double f1 = _a[1] * _b[1] + _c[1];
+
+    __m128d d = _mm_fmadd_pd(a, b, c);
+    // The tolerance is a double literal, matching the epsilon parameter type.
+    return validateDoubleEpsilon(d, f0, f1, 0.0001);
+}
+
 /* Others */
 result_t test_mm_clmulepi64_si128(const SSE2NEONTestImpl &impl, uint32_t i)
 {

@@ -527,6 +527,7 @@
     TYPE(mm_aeskeygenassist_si128)    \
     /* FMA */                         \
     TYPE(mm_fmadd_ps)                 \
+    TYPE(mm_fmadd_pd)                 \
     /* Others */                      \
     TYPE(mm_clmulepi64_si128)         \
     TYPE(mm_popcnt_u32)               \