IA-32 Architecture: Manual Processor Dispatch Example

This example shows multiple optimized functions in a single binary file. You can use manual processor dispatch to support up to seven different versions of any one function, targeting up to seven different processors.

#include <stddef.h>   /* size_t */
#include <mmintrin.h>


/* Pentium processor version: adds two arrays element by element without
   using intrinsics.
     r - destination array; r[i] receives a[i] + b[i]
     a - first source array (read only)
     b - second source array
     l - number of elements to add; when zero, nothing is written */
__declspec(cpu_specific(pentium))
void array_sum(int *r, int const *a, int *b, size_t l)
{
   /* Count l down to zero while advancing all three pointers in step. */
   for (; l > 0; l--) {
      *r++ = *a++ + *b++;
   }
}
/* Implementation for a Pentium processor with MMX technology: uses an MMX
   instruction intrinsic to add two 32-bit elements simultaneously.
     r - destination array; r[i] receives a[i] + b[i]
     a - first source array (read only)
     b - second source array
     l - number of elements to add; when zero, nothing is written
   The packed path relies on int being 32 bits wide, so that one __m64
   holds exactly two elements. */
__declspec(cpu_specific(pentium_MMX))
void array_sum(int *r, int const *a, int *b, size_t l)
{
   __m64 *mmx_r = (__m64 *)r;
   __m64 const *mmx_a = (__m64 const *)a;
   __m64 const *mmx_b = (__m64 const *)b;
   int const *tail_b;

   /* Packed 32-bit add: each iteration consumes one pair of elements. */
   for (; l > 1; l -= 2) {
      *mmx_r++ = _mm_add_pi32(*mmx_a++, *mmx_b++);
   }

   /* Leave MMX state so that later x87 floating-point code is not affected. */
   _mm_empty();

   /* The following code, which takes care of the excess element, is not
      needed if the array sizes passed are known to be multiples of two. */
   r = (int *)mmx_r;
   a = (int const *)mmx_a;
   tail_b = (int const *)mmx_b;
   for (; l > 0; l--) {
      *r++ = *a++ + *tail_b++;
   }
}

/* The function stub informs the compiler to generate the CPU-dispatch
   function for the processors listed in the cpu_dispatch clause. Its
   signature must match the cpu_specific versions. */
__declspec(cpu_dispatch(pentium, pentium_MMX))
void array_sum(int *r, int const *a, int *b, size_t l)
{
   /* The function body is empty. */
}