[quake3-commits] r2170 - trunk/code/asm
DONOTREPLY at icculus.org
DONOTREPLY at icculus.org
Mon Sep 19 14:30:25 EDT 2011
Author: thilo
Date: 2011-09-19 14:30:24 -0400 (Mon, 19 Sep 2011)
New Revision: 2170
Modified:
trunk/code/asm/snapvector.asm
trunk/code/asm/snapvector.c
Log:
Implement Mathias Benthrup's suggestion for the x86 ASM snapvector implementation, which reduces cache misses.
Modified: trunk/code/asm/snapvector.asm
===================================================================
--- trunk/code/asm/snapvector.asm 2011-09-19 15:49:45 UTC (rev 2169)
+++ trunk/code/asm/snapvector.asm 2011-09-19 18:30:24 UTC (rev 2170)
@@ -48,14 +48,15 @@
stmxcsr [rsp] ; save SSE control word
ldmxcsr ssecw ; set to round nearest
- push rdi
- mov rdi, rcx ; maskmovdqu uses rdi as implicit memory operand
- movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
- movups xmm0, [rdi] ; here is stored our vector. Read 4 values in one go
+ movaps xmm1, ssemask ; initialize the mask register
+ movups xmm0, [rcx] ; here is stored our vector. Read 4 values in one go
+ movaps xmm2, xmm0 ; keep a copy of the original data
+ andps xmm0, xmm1 ; set the fourth value to zero in xmm0
+ andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
- maskmovdqu xmm0, xmm1 ; write 3 values back to memory
- pop rdi
+ orps xmm0, xmm1 ; combine all 4 values again
+ movups [rcx], xmm0 ; write 3 rounded and 1 unchanged values back to memory
ldmxcsr [rsp] ; restore sse control word to old value
add rsp, 8
@@ -69,14 +70,16 @@
stmxcsr [esp] ; save SSE control word
ldmxcsr ssecw ; set to round nearest
- push edi
- mov edi, dword ptr 16[esp] ; maskmovdqu uses edi as implicit memory operand
- movaps xmm1, ssemask ; initialize the mask register for maskmovdqu
- movups xmm0, [edi] ; here is stored our vector. Read 4 values in one go
+ mov eax, dword ptr 16[esp] ; store address of vector in eax
+ movaps xmm1, ssemask ; initialize the mask register
+ movups xmm0, [eax] ; here is stored our vector. Read 4 values in one go
+ movaps xmm2, xmm0 ; keep a copy of the original data
+ andps xmm0, xmm1 ; set the fourth value to zero in xmm0
+ andnps xmm1, xmm2 ; copy fourth value to xmm1 and set rest to zero
cvtps2dq xmm0, xmm0 ; convert 4 single fp to int
cvtdq2ps xmm0, xmm0 ; convert 4 int to single fp
- maskmovdqu xmm0, xmm1 ; write 3 values back to memory
- pop edi
+ orps xmm0, xmm1 ; combine all 4 values again
+ movups [eax], xmm0 ; write 3 rounded and 1 unchanged values back to memory
ldmxcsr [esp] ; restore sse control word to old value
add esp, 8
Modified: trunk/code/asm/snapvector.c
===================================================================
--- trunk/code/asm/snapvector.c 2011-09-19 15:49:45 UTC (rev 2169)
+++ trunk/code/asm/snapvector.c 2011-09-19 18:30:24 UTC (rev 2170)
@@ -47,17 +47,18 @@
"movaps (%0), %%xmm1\n"
"movups (%2), %%xmm0\n"
+ "movaps %%xmm0, %%xmm2\n"
+ "andps %%xmm1, %%xmm0\n"
+ "andnps %%xmm2, %%xmm1\n"
"cvtps2dq %%xmm0, %%xmm0\n"
"cvtdq2ps %%xmm0, %%xmm0\n"
- // vec MUST reside in register rdi as maskmovdqu uses
- // it as an implicit operand. The "D" constraint makes
- // sure of that.
- "maskmovdqu %%xmm1, %%xmm0\n"
+ "orps %%xmm1, %%xmm0\n"
+ "movups %%xmm0, (%2)\n"
"ldmxcsr %3\n"
:
- : "r" (ssemask), "m" (ssecw), "D" (vec), "m" (oldcw)
- : "memory", "%xmm0", "%xmm1"
+ : "r" (ssemask), "m" (ssecw), "r" (vec), "m" (oldcw)
+ : "memory", "%xmm0", "%xmm1", "%xmm2"
);
}
More information about the quake3-commits
mailing list