128-bit MMX versus 32-bit memory copy

  • Follow


In a certain app, the code performs lots of memory copies in 128-byte
chunks between many different locations. I figured I'd get a
significant speedup if I used MMX 128-bit mov instructions to implement
the memory copy. However, the performance is anywhere from 33% to 20%
of what I get using regular 32-bit mov instructions. Why is the
performance so poor for MMX? Too much overhead to be beneficial for
128-byte memory copies?

The cycle counts I'm getting on my 2.3 GHz P4 are around 2000-2500 for
the normal 32-bit memory copy and 6000-10000 for the MMX 128-bit memory
copy.

Here's the code in MS .NET:

//#define USE_MMX

// Source buffer for the copy test: 128 bytes, declared 16-byte aligned.
// Both assignmmx() and assign() read from this object.
__declspec(align(16)) struct {
  unsigned char a[128];
} addr1;

// Destination buffer for the copy test: 128 bytes, declared 16-byte aligned.
// Both assignmmx() and assign() write to this object.
__declspec(align(16)) struct {
  unsigned char a[128];
} addr2;

// Returns the processor time-stamp counter as a 64-bit value.
// The function is naked (no compiler-generated prologue/epilogue): it
// executes rdtsc and returns immediately, leaving the counter in EDX:EAX
// as the return value.
// NOTE(review): rdtsc is not a serializing instruction, so surrounding
// instructions may execute out of order relative to it -- readings taken
// around very short code sequences carry some variance.
// NOTE(review): the name `clock` matches the standard C library function
// declared in <time.h>; confirm that header is not included alongside this.
__declspec(naked) unsigned __int64 clock( ) {
  __asm {
    rdtsc
    ret
  }
}

// Copies the 128 bytes of addr1 into addr2 using eight 16-byte
// load/store pairs through the xmm0 register.
// Despite the "mmx" in the name, movdqa on an XMM register is an SSE2
// instruction, not MMX.
// movdqa requires its memory operand to be 16-byte aligned; addr1 and
// addr2 are declared with align(16) and every offset used here is a
// multiple of 16.
// Each store consumes the register written by the load directly before
// it, and the same register (xmm0) is reused for every pair.
__forceinline void assignmmx( ) {
  __asm {
    movdqa xmm0, xmmword ptr[addr1+0]
    movdqa xmmword ptr[addr2+0],  xmm0
    movdqa xmm0, xmmword ptr[addr1+16]
    movdqa xmmword ptr[addr2+16], xmm0
    movdqa xmm0, xmmword ptr[addr1+32]
    movdqa xmmword ptr[addr2+32], xmm0
    movdqa xmm0, xmmword ptr[addr1+48]
    movdqa xmmword ptr[addr2+48], xmm0
    movdqa xmm0, xmmword ptr[addr1+64]
    movdqa xmmword ptr[addr2+64], xmm0
    movdqa xmm0, xmmword ptr[addr1+80]
    movdqa xmmword ptr[addr2+80], xmm0
    movdqa xmm0, xmmword ptr[addr1+96]
    movdqa xmmword ptr[addr2+96], xmm0
    movdqa xmm0, xmmword ptr[addr1+112]
    movdqa xmmword ptr[addr2+112], xmm0
  }
}

// Copies the 128 bytes of addr1 into addr2 using thirty-two 4-byte
// load/store pairs through the eax register (offsets 0, 4, ..., 124).
// This is the plain 32-bit integer baseline that assignmmx() is compared
// against. The sequence is fully unrolled; there is no loop or counter.
__forceinline void assign( ) {
  __asm {
    mov eax, dword ptr[addr1+0]
    mov dword ptr[addr2+0], eax
    mov eax, dword ptr[addr1+4]
    mov dword ptr[addr2+4], eax
    mov eax, dword ptr[addr1+8]
    mov dword ptr[addr2+8], eax
    mov eax, dword ptr[addr1+12]
    mov dword ptr[addr2+12], eax
    mov eax, dword ptr[addr1+16]
    mov dword ptr[addr2+16], eax
    mov eax, dword ptr[addr1+20]
    mov dword ptr[addr2+20], eax
    mov eax, dword ptr[addr1+24]
    mov dword ptr[addr2+24], eax
    mov eax, dword ptr[addr1+28]
    mov dword ptr[addr2+28], eax
    mov eax, dword ptr[addr1+32]
    mov dword ptr[addr2+32], eax
    mov eax, dword ptr[addr1+36]
    mov dword ptr[addr2+36], eax
    mov eax, dword ptr[addr1+40]
    mov dword ptr[addr2+40], eax
    mov eax, dword ptr[addr1+44]
    mov dword ptr[addr2+44], eax
    mov eax, dword ptr[addr1+48]
    mov dword ptr[addr2+48], eax
    mov eax, dword ptr[addr1+52]
    mov dword ptr[addr2+52], eax
    mov eax, dword ptr[addr1+56]
    mov dword ptr[addr2+56], eax
    mov eax, dword ptr[addr1+60]
    mov dword ptr[addr2+60], eax
    mov eax, dword ptr[addr1+64]
    mov dword ptr[addr2+64], eax
    mov eax, dword ptr[addr1+68]
    mov dword ptr[addr2+68], eax
    mov eax, dword ptr[addr1+72]
    mov dword ptr[addr2+72], eax
    mov eax, dword ptr[addr1+76]
    mov dword ptr[addr2+76], eax
    mov eax, dword ptr[addr1+80]
    mov dword ptr[addr2+80], eax
    mov eax, dword ptr[addr1+84]
    mov dword ptr[addr2+84], eax
    mov eax, dword ptr[addr1+88]
    mov dword ptr[addr2+88], eax
    mov eax, dword ptr[addr1+92]
    mov dword ptr[addr2+92], eax
    mov eax, dword ptr[addr1+96]
    mov dword ptr[addr2+96], eax
    mov eax, dword ptr[addr1+100]
    mov dword ptr[addr2+100], eax
    mov eax, dword ptr[addr1+104]
    mov dword ptr[addr2+104], eax
    mov eax, dword ptr[addr1+108]
    mov dword ptr[addr2+108], eax
    mov eax, dword ptr[addr1+112]
    mov dword ptr[addr2+112], eax
    mov eax, dword ptr[addr1+116]
    mov dword ptr[addr2+116], eax
    mov eax, dword ptr[addr1+120]
    mov dword ptr[addr2+120], eax
    mov eax, dword ptr[addr1+124]
    mov dword ptr[addr2+124], eax
  }
}


// Times one 128-byte copy from addr1 to addr2 by reading the time-stamp
// counter immediately before and after the copy.
// The copy routine is chosen at compile time: assignmmx() when USE_MMX is
// defined, assign() otherwise.
// NOTE(review): clocktime1 and clocktime2 are written but never
// subtracted, returned or printed in the code shown here -- the elapsed
// count must be obtained some other way (e.g. a debugger); confirm.
// NOTE(review): only a single pass is timed, with no warm-up run, so the
// reading presumably includes any first-touch cache/page costs for addr1
// and addr2 -- verify before comparing the two variants.
void sse2test( ) {
  unsigned __int64 clocktime1, clocktime2;

  // Timestamp before the copy.
  clocktime1 = clock();
#ifdef USE_MMX
  assignmmx();
#else
  assign();
#endif
  // Timestamp after the copy.
  clocktime2 = clock();
}

0
Reply spamtrap2 (1628) 5/11/2005 4:40:20 AM

spamtrap@crayne.org wrote:
> In a certain app, the code performs lots of memory copies in 128-byte
> chunks between many different locations. I figured I'd get a
> significant speedup if I used MMX 128-bit mov instructions to
> implement the memory copy. However, the performance is anywhere from
> 33% to 20% of what I get using regular 32-bit mov instructions. Why
> is the performance so poor for MMX? Too much overhead to be
> beneficial for 128-byte memory copies?

Are you talking about MMX (movq), which is 64-bit, or SSE (movdqa/movdqu, 
movaps/movups, movapd/movupd)? Looking at your code, it seems that you meant 
SSE.

> The cycle counts I'm getting on my 2.3 P4 is around 2000-2500 for the
> normal 32-bit memory copy and 6000-10000 for the MMX 128-bit memory
> copy.
[...]

Those are *way* too high. You should be seeing around 28 cycles for the SSE 
copy and around 32 for the integer copy. Add +/- 10 cycles variance to each 
due to rdtsc executing out of order and other uncontrollable factors. Either 
something is wrong, or you did not post the exact code that you timed.

By the way, you might improve your SSE performance by rewriting the code 
like so:

movdqa xmm0, [addr1]
movdqa xmm1, [addr1+16]
movdqa xmm2, [addr1+32]
movdqa xmm3, [addr1+48]
movdqa xmm4, [addr1+64]
movdqa xmm5, [addr1+80]
movdqa xmm6, [addr1+96]
movdqa xmm7, [addr1+112]

movdqa [addr2], xmm0
movdqa [addr2+16], xmm1
movdqa [addr2+32], xmm2
movdqa [addr2+48], xmm3
movdqa [addr2+64], xmm4
movdqa [addr2+80], xmm5
movdqa [addr2+96], xmm6
movdqa [addr2+112], xmm7

The latency of a movdqa instruction is 6 cycles. This rewrite ensures that a 
store is not issued until the corresponding load is completed. Also, it is 
supposedly better to burst loads and stores. It might not make a difference 
for you, but as I said, I am not very familiar with the P4 
microarchitecture.

-Matt 

0
Reply Matt 5/11/2005 7:04:46 PM


1 Replies
248 Views

(page loaded in 0.067 seconds)

Similar Articles:













7/25/2012 1:32:10 PM


Reply: