In a certain app, the code performs lots of memory copies in 128-byte
chunks between many different locations. I figured I'd get a
significant speedup if I used 128-bit SSE2 mov instructions (movdqa on
the XMM registers; called "MMX" in the code below) to implement the
memory copy. However, the performance is anywhere from 33% down to 20%
of what I get using regular 32-bit mov instructions. Why is the
performance so poor for the 128-bit version? Is there too much overhead
for it to be beneficial for 128-byte memory copies?
The cycle counts I'm getting on my 2.3 GHz P4 are around 2000-2500 for
the normal 32-bit memory copy and 6000-10000 for the MMX 128-bit memory
copy.
Here's the code in MS .NET:
// Build-time switch: when USE_MMX is defined, sse2test() times assignmmx()
// (the 128-bit XMM copy); otherwise it times assign() (the 32-bit copy).
// Uncomment the line below to select the 128-bit version.
//#define USE_MMX
// Source buffer read by assign() and assignmmx(): 128 bytes, declared with
// 16-byte alignment so that aligned 16-byte moves (movdqa) can be used on it.
__declspec(align(16)) struct {
unsigned char a[128];
} addr1;
// Destination buffer written by assign() and assignmmx(): 128 bytes,
// declared with 16-byte alignment so that aligned 16-byte moves (movdqa)
// can be used on it.
__declspec(align(16)) struct {
unsigned char a[128];
} addr2;
// Reads the CPU time-stamp counter and returns it as a 64-bit value.
//
// The function is declared naked, so the compiler generates no prologue or
// epilogue and the body has to execute its own `ret`. RDTSC leaves the
// counter in EDX:EAX, which is the register pair 32-bit MSVC uses for a
// 64-bit integer return value.
//
// NOTE(review): this has the same name as the standard library clock()
// declared in <time.h>; confirm that header is never pulled in alongside it.
// NOTE(review): RDTSC is not a serializing instruction, so neighbouring
// instructions may still be in flight when the counter is sampled.
__declspec(naked) unsigned __int64 clock( ) {
__asm {
rdtsc
ret
}
}
// Copies the 128 bytes of addr1 into addr2 using eight aligned 16-byte
// load/store pairs (offsets 0, 16, ..., 112), all routed through xmm0.
//
// Despite the "mmx" in the name, movdqa and the xmm registers belong to
// SSE2, not MMX. movdqa requires its memory operand to be 16-byte aligned;
// both buffers are declared with __declspec(align(16)).
//
// Every pair reuses xmm0, so each store consumes the value produced by the
// load immediately before it.
__forceinline void assignmmx( ) {
__asm {
movdqa xmm0, xmmword ptr[addr1+0]
movdqa xmmword ptr[addr2+0], xmm0
movdqa xmm0, xmmword ptr[addr1+16]
movdqa xmmword ptr[addr2+16], xmm0
movdqa xmm0, xmmword ptr[addr1+32]
movdqa xmmword ptr[addr2+32], xmm0
movdqa xmm0, xmmword ptr[addr1+48]
movdqa xmmword ptr[addr2+48], xmm0
movdqa xmm0, xmmword ptr[addr1+64]
movdqa xmmword ptr[addr2+64], xmm0
movdqa xmm0, xmmword ptr[addr1+80]
movdqa xmmword ptr[addr2+80], xmm0
movdqa xmm0, xmmword ptr[addr1+96]
movdqa xmmword ptr[addr2+96], xmm0
movdqa xmm0, xmmword ptr[addr1+112]
movdqa xmmword ptr[addr2+112], xmm0
}
}
// Copies the 128 bytes of addr1 into addr2 using 32 fully unrolled 4-byte
// load/store pairs (offsets 0, 4, ..., 124), all routed through eax.
//
// This is the 32-bit baseline that assignmmx() is compared against in
// sse2test(). Every pair reuses eax, so each store consumes the value
// produced by the load immediately before it.
__forceinline void assign( ) {
__asm {
mov eax, dword ptr[addr1+0]
mov dword ptr[addr2+0], eax
mov eax, dword ptr[addr1+4]
mov dword ptr[addr2+4], eax
mov eax, dword ptr[addr1+8]
mov dword ptr[addr2+8], eax
mov eax, dword ptr[addr1+12]
mov dword ptr[addr2+12], eax
mov eax, dword ptr[addr1+16]
mov dword ptr[addr2+16], eax
mov eax, dword ptr[addr1+20]
mov dword ptr[addr2+20], eax
mov eax, dword ptr[addr1+24]
mov dword ptr[addr2+24], eax
mov eax, dword ptr[addr1+28]
mov dword ptr[addr2+28], eax
mov eax, dword ptr[addr1+32]
mov dword ptr[addr2+32], eax
mov eax, dword ptr[addr1+36]
mov dword ptr[addr2+36], eax
mov eax, dword ptr[addr1+40]
mov dword ptr[addr2+40], eax
mov eax, dword ptr[addr1+44]
mov dword ptr[addr2+44], eax
mov eax, dword ptr[addr1+48]
mov dword ptr[addr2+48], eax
mov eax, dword ptr[addr1+52]
mov dword ptr[addr2+52], eax
mov eax, dword ptr[addr1+56]
mov dword ptr[addr2+56], eax
mov eax, dword ptr[addr1+60]
mov dword ptr[addr2+60], eax
mov eax, dword ptr[addr1+64]
mov dword ptr[addr2+64], eax
mov eax, dword ptr[addr1+68]
mov dword ptr[addr2+68], eax
mov eax, dword ptr[addr1+72]
mov dword ptr[addr2+72], eax
mov eax, dword ptr[addr1+76]
mov dword ptr[addr2+76], eax
mov eax, dword ptr[addr1+80]
mov dword ptr[addr2+80], eax
mov eax, dword ptr[addr1+84]
mov dword ptr[addr2+84], eax
mov eax, dword ptr[addr1+88]
mov dword ptr[addr2+88], eax
mov eax, dword ptr[addr1+92]
mov dword ptr[addr2+92], eax
mov eax, dword ptr[addr1+96]
mov dword ptr[addr2+96], eax
mov eax, dword ptr[addr1+100]
mov dword ptr[addr2+100], eax
mov eax, dword ptr[addr1+104]
mov dword ptr[addr2+104], eax
mov eax, dword ptr[addr1+108]
mov dword ptr[addr2+108], eax
mov eax, dword ptr[addr1+112]
mov dword ptr[addr2+112], eax
mov eax, dword ptr[addr1+116]
mov dword ptr[addr2+116], eax
mov eax, dword ptr[addr1+120]
mov dword ptr[addr2+120], eax
mov eax, dword ptr[addr1+124]
mov dword ptr[addr2+124], eax
}
}
void sse2test( ) {
unsigned __int64 clocktime1, clocktime2;
clocktime1 = clock();
#ifdef USE_MMX
assignmmx();
#else
assign();
#endif
clocktime2 = clock();
}
|