網路黑貓 BlackCat on Net / Champ Yen: The Power of Prefetching

這篇其實比"你也可以寫 SIMD 比網頁還快"早很多, 其中 I 的部分就是與這篇的 function 比較速度
====

先前在臉書上有分享過 “When Prefetching Works, When It Doesn’t, and Why”這篇 paper, 現在要用實例顯示 prefetching 帶來的效益, 今日使用的範例是 Matrix Transpose

一個簡單的 C 版本實作如下

void naive_transpose(int *src, int *dst, int w, int h){

    for(int x = 0; x < w; x++){

        for(int y = 0; y < h; y++){

             *(dst + x*h + y) = *(src + y*w + x);

SSE2 實作版本如下, 由於先進處理器架構有著 automatic/speculative prefetcher 所以這裡我們故意寫得比較 CPU-unfriendly 一些

void sse_transpose(int *src, int *dst, int w, int h){

    for(int x = 0; x < w; x+=4){

        for(int y = 0; y < h; y+=4){

            __m128i I0 = _mm_loadu_si128 ((__m128i*)(src+y*w+x));

            __m128i I1 = _mm_loadu_si128 ((__m128i*)(src+(y+1)*w+x));

            __m128i I2 = _mm_loadu_si128 ((__m128i*)(src+(y+2)*w+x));

            __m128i I3 = _mm_loadu_si128 ((__m128i*)(src+(y+3)*w+x));

            __m128i T0 = _mm_unpacklo_epi32(I0, I1);

            __m128i T1 = _mm_unpacklo_epi32(I2, I3);

            __m128i T2 = _mm_unpackhi_epi32(I0, I1);

            __m128i T3 = _mm_unpackhi_epi32(I2, I3);

            I0 = _mm_unpacklo_epi64(T0, T1);

            I1 = _mm_unpackhi_epi64(T0, T1);

            I2 = _mm_unpacklo_epi64(T2, T3);

            I3 = _mm_unpackhi_epi64(T2, T3);

            _mm_storeu_si128((__m128i*)(dst+(x*h)+y), I0);

            _mm_storeu_si128((__m128i*)(dst+((x+1)*h)+y), I1);

            _mm_storeu_si128((__m128i*)(dst+((x+2)*h)+y), I2);

            _mm_storeu_si128((__m128i*)(dst+((x+3)*h)+y), I3);

接著我們來撰寫一個使用 SSE prefetch 指令的加速版本

void sse_prefetch_transpose(int *src, int *dst, int w, int h){

    for(int x = 0; x < w; x+=4){

        for(int y = 0; y < h; y+=4){

            #define PFDIST  8
            _mm_prefetch(src+(y+PFDIST)*w+x, _MM_HINT_T1);

            _mm_prefetch(src+(y+PFDIST+1)*w+x, _MM_HINT_T1);

            _mm_prefetch(src+(y+PFDIST+2)*w+x, _MM_HINT_T1);

            _mm_prefetch(src+(y+PFDIST+3)*w+x, _MM_HINT_T1);

 
            __m128i I0 = _mm_loadu_si128 ((__m128i*)(src+y*w+x));

            __m128i I1 = _mm_loadu_si128 ((__m128i*)(src+(y+1)*w+x));

            __m128i I2 = _mm_loadu_si128 ((__m128i*)(src+(y+2)*w+x));

            __m128i I3 = _mm_loadu_si128 ((__m128i*)(src+(y+3)*w+x));

            __m128i T0 = _mm_unpacklo_epi32(I0, I1);

            __m128i T1 = _mm_unpacklo_epi32(I2, I3);

            __m128i T2 = _mm_unpackhi_epi32(I0, I1);

            __m128i T3 = _mm_unpackhi_epi32(I2, I3);

            I0 = _mm_unpacklo_epi64(T0, T1);

            I1 = _mm_unpackhi_epi64(T0, T1);

            I2 = _mm_unpacklo_epi64(T2, T3);

            I3 = _mm_unpackhi_epi64(T2, T3);

            _mm_storeu_si128((__m128i*)(dst+(x*h)+y), I0);

            _mm_storeu_si128((__m128i*)(dst+((x+1)*h)+y), I1);

            _mm_storeu_si128((__m128i*)(dst+((x+2)*h)+y), I2);

            _mm_storeu_si128((__m128i*)(dst+((x+3)*h)+y), I3);

接著是測試程式(請將上述程式碼置於指定位置)

#include

#include

#include

#include 
//for using x86-SSE2 intrinsics you have to include this

#include

// PUT naive_transpose, sse_transpose, sse_prefetch_transpose HERE

// ...

#define TEST_W 4096

#define TEST_H 4096
int main(void){

    //verify the result of 4x4 matrix

        int testin[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

        int testout[16];
        for(int y = 0; y < 4; y++){

            for(int x = 0; x < 4; x++)

                printf(" %2d", testin[y*4+x]);

            printf("\n");

        printf("\n");

        sse_transpose(testin, testout, 4, 4);

        for(int y = 0; y < 4; y++){

            for(int x = 0; x < 4; x++)

                printf(" %2d", testout[y*4+x]);

            printf("\n");

        struct timeval stime, etime;

        int *src = (int*)malloc(sizeof(int)*TEST_W*TEST_H);

        int *out0 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);

        int *out1 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);

        int *out2 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);


        srand(time(NULL));

        for(int y = 0; y < TEST_H; y++){

            for(int x = 0; x < TEST_W; x++)

                *(src + y*TEST_W + x) = rand();

        }
        gettimeofday(&stime, NULL);

        sse_prefetch_transpose(src, out0, TEST_W, TEST_H);

        gettimeofday(&etime, NULL);

        printf("sse prefetch: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));

        gettimeofday(&stime, NULL);

        sse_transpose(src, out1, TEST_W, TEST_H);

        gettimeofday(&etime, NULL);

        printf("sse: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));

        gettimeofday(&stime, NULL);

        naive_transpose(src, out2, TEST_W, TEST_H);

        gettimeofday(&etime, NULL);

        printf("naive: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));

        free(src);

        free(out0);

        free(out1);

        free(out2);

     }
     return 0;

存成 test.c, 並且按照下列方式編譯

gcc -msse2 -o test test.c

於個人的 Core i7 3612QM 執行得到下列結果

  0  1  2  3

  4  5  6  7

  8  9 10 11

 12 13 14 15

  0  4  8 12

  1  5  9 13

  2  6 10 14

  3  7 11 15

sse prefetch:  57782 us

sse: 117184 us

naive: 228870 us

儘管現今 CPU 有著強大的 auto-prefetcher, 但是並不是所有的演算都有著規律或是線型的記憶體存取模式, 如此透過優化 prefetching 依然是能夠有相當的效能增進.

網路黑貓 BlackCat on Net / Champ Yen