這篇其實比"你也可以寫 SIMD 比網頁還快"早很多, 其中 I 的部分就是與這篇的 function 比較速度
====
====
先前在臉書上有分享過 “When Prefetching Works, When It Doesn’t, and Why”這篇 paper, 現在要用實例顯示 prefetching 帶來的效益, 今日使用的範例是 Matrix Transpose
一個簡單的 C 版本實作如下
void naive_transpose(int *src, int *dst, int w, int h){for(int x = 0; x < w; x++){for(int y = 0; y < h; y++){*(dst + x*h + y) = *(src + y*w + x);}}
}
SSE2 實作版本如下, 由於先進處理器架構有著 automatic/speculative prefetcher 所以這裡我們故意寫得比較 CPU-unfriendly 一些
void sse_transpose(int *src, int *dst, int w, int h){for(int x = 0; x < w; x+=4){
for(int y = 0; y < h; y+=4){
__m128i I0 = _mm_loadu_si128 ((__m128i*)(src+y*w+x));__m128i I1 = _mm_loadu_si128 ((__m128i*)(src+(y+1)*w+x));
__m128i I2 = _mm_loadu_si128 ((__m128i*)(src+(y+2)*w+x));__m128i I3 = _mm_loadu_si128 ((__m128i*)(src+(y+3)*w+x));
__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);I0 = _mm_unpacklo_epi64(T0, T1);I1 = _mm_unpackhi_epi64(T0, T1);I2 = _mm_unpacklo_epi64(T2, T3);I3 = _mm_unpackhi_epi64(T2, T3);_mm_storeu_si128((__m128i*)(dst+(x*h)+y), I0);_mm_storeu_si128((__m128i*)(dst+((x+1)*h)+y), I1);_mm_storeu_si128((__m128i*)(dst+((x+2)*h)+y), I2);_mm_storeu_si128((__m128i*)(dst+((x+3)*h)+y), I3);
}}
}
接著我們來撰寫一個使用 SSE prefetch 指令的加速版本
void sse_prefetch_transpose(int *src, int *dst, int w, int h){for(int x = 0; x < w; x+=4){
for(int y = 0; y < h; y+=4){#define PFDIST 8 _mm_prefetch(src+(y+PFDIST)*w+x, _MM_HINT_T1);
_mm_prefetch(src+(y+PFDIST+1)*w+x, _MM_HINT_T1);
_mm_prefetch(src+(y+PFDIST+2)*w+x, _MM_HINT_T1);
_mm_prefetch(src+(y+PFDIST+3)*w+x, _MM_HINT_T1);
__m128i I0 = _mm_loadu_si128 ((__m128i*)(src+y*w+x));__m128i I1 = _mm_loadu_si128 ((__m128i*)(src+(y+1)*w+x));__m128i I2 = _mm_loadu_si128 ((__m128i*)(src+(y+2)*w+x));__m128i I3 = _mm_loadu_si128 ((__m128i*)(src+(y+3)*w+x));__m128i T0 = _mm_unpacklo_epi32(I0, I1);
__m128i T1 = _mm_unpacklo_epi32(I2, I3);
__m128i T2 = _mm_unpackhi_epi32(I0, I1);
__m128i T3 = _mm_unpackhi_epi32(I2, I3);I0 = _mm_unpacklo_epi64(T0, T1);I1 = _mm_unpackhi_epi64(T0, T1);I2 = _mm_unpacklo_epi64(T2, T3);I3 = _mm_unpackhi_epi64(T2, T3);_mm_storeu_si128((__m128i*)(dst+(x*h)+y), I0);_mm_storeu_si128((__m128i*)(dst+((x+1)*h)+y), I1);_mm_storeu_si128((__m128i*)(dst+((x+2)*h)+y), I2);_mm_storeu_si128((__m128i*)(dst+((x+3)*h)+y), I3);}
}
}
接著是測試程式(請將上述程式碼置於指定位置)
#include#include
#include#include//for using x86-SSE2 intrinsics you have to include this #include // PUT naive_transpose, sse_transpose, sse_prefetch_transpose HERE// ...#define TEST_W 4096
#define TEST_H 4096 int main(void){//verify the result of 4x4 matrix{int testin[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };int testout[16]; for(int y = 0; y < 4; y++){for(int x = 0; x < 4; x++)printf(" %2d", testin[y*4+x]);printf("\n");
}printf("\n");
sse_transpose(testin, testout, 4, 4);for(int y = 0; y < 4; y++){for(int x = 0; x < 4; x++)printf(" %2d", testout[y*4+x]);printf("\n");
}}{
struct timeval stime, etime;int *src = (int*)malloc(sizeof(int)*TEST_W*TEST_H);
int *out0 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);int *out1 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);int *out2 = (int*)malloc(sizeof(int)*TEST_W*TEST_H);
srand(time(NULL));for(int y = 0; y < TEST_H; y++){for(int x = 0; x < TEST_W; x++)
*(src + y*TEST_W + x) = rand();
} gettimeofday(&stime, NULL);sse_prefetch_transpose(src, out0, TEST_W, TEST_H);
gettimeofday(&etime, NULL);
printf("sse prefetch: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));gettimeofday(&stime, NULL);sse_transpose(src, out1, TEST_W, TEST_H);
gettimeofday(&etime, NULL);printf("sse: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec));gettimeofday(&stime, NULL);naive_transpose(src, out2, TEST_W, TEST_H);
gettimeofday(&etime, NULL);printf("naive: %ld us\n", (etime.tv_sec - stime.tv_sec)*1000000 + (etime.tv_usec - stime.tv_usec)); free(src);free(out0);free(out1);free(out2);} return 0;}
存成 test.c, 並且按照下列方式編譯
gcc -msse2 -o test test.c
於個人的 Core i7 3612QM 執行得到下列結果
0 1 2 34 5 6 7
8 9 10 1112 13 14 150 4 8 121 5 9 132 6 10 14
3 7 11 15sse prefetch: 57782 us
sse: 117184 usnaive: 228870 us
儘管現今 CPU 有著強大的 auto-prefetcher, 但是並不是所有的演算都有著規律或是線型的記憶體存取模式, 如此透過優化 prefetching 依然是能夠有相當的效能增進.
沒有留言:
張貼留言