[quake3-commits] r2170 - trunk/code/asm

DONOTREPLY at icculus.org DONOTREPLY at icculus.org
Mon Sep 19 14:30:25 EDT 2011


Author: thilo
Date: 2011-09-19 14:30:24 -0400 (Mon, 19 Sep 2011)
New Revision: 2170

Modified:
   trunk/code/asm/snapvector.asm
   trunk/code/asm/snapvector.c
Log:
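Implement Mathias Benthrup's suggestion for the x86 ASM snapvector implementation: replace the maskmovdqu store with an andps/andnps/orps blend followed by a plain movups store, which reduces cache misses.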


Modified: trunk/code/asm/snapvector.asm
===================================================================
--- trunk/code/asm/snapvector.asm	2011-09-19 15:49:45 UTC (rev 2169)
+++ trunk/code/asm/snapvector.asm	2011-09-19 18:30:24 UTC (rev 2170)
@@ -48,14 +48,15 @@
 	stmxcsr [rsp]				; save SSE control word
 	ldmxcsr ssecw				; set to round nearest
 
-    push rdi
-	mov rdi, rcx				; maskmovdqu uses rdi as implicit memory operand
-	movaps xmm1, ssemask		; initialize the mask register for maskmovdqu
-    movups xmm0, [rdi]			; here is stored our vector. Read 4 values in one go
+	movaps xmm1, ssemask		; initialize the mask register
+	movups xmm0, [rcx]			; here is stored our vector. Read 4 values in one go
+	movaps xmm2, xmm0			; keep a copy of the original data
+	andps xmm0, xmm1			; set the fourth value to zero in xmm0
+	andnps xmm1, xmm2			; copy fourth value to xmm1 and set rest to zero
 	cvtps2dq xmm0, xmm0			; convert 4 single fp to int
 	cvtdq2ps xmm0, xmm0			; convert 4 int to single fp
-	maskmovdqu xmm0, xmm1		; write 3 values back to memory
-	pop rdi
+	orps xmm0, xmm1				; combine all 4 values again
+	movups [rcx], xmm0			; write 3 rounded and 1 unchanged values back to memory
 
 	ldmxcsr [rsp]				; restore sse control word to old value
 	add rsp, 8
@@ -69,14 +70,16 @@
 	stmxcsr [esp]				; save SSE control word
 	ldmxcsr ssecw				; set to round nearest
 
-    push edi
-	mov edi, dword ptr 16[esp]	; maskmovdqu uses edi as implicit memory operand
-	movaps xmm1, ssemask		; initialize the mask register for maskmovdqu
-    movups xmm0, [edi]			; here is stored our vector. Read 4 values in one go
+	mov eax, dword ptr 16[esp]		; store address of vector in eax
+	movaps xmm1, ssemask			; initialize the mask register
+	movups xmm0, [eax]			; here is stored our vector. Read 4 values in one go
+	movaps xmm2, xmm0			; keep a copy of the original data
+	andps xmm0, xmm1			; set the fourth value to zero in xmm0
+	andnps xmm1, xmm2			; copy fourth value to xmm1 and set rest to zero
 	cvtps2dq xmm0, xmm0			; convert 4 single fp to int
 	cvtdq2ps xmm0, xmm0			; convert 4 int to single fp
-	maskmovdqu xmm0, xmm1		; write 3 values back to memory
-	pop edi
+	orps xmm0, xmm1				; combine all 4 values again
+	movups [eax], xmm0			; write 3 rounded and 1 unchanged values back to memory
 
 	ldmxcsr [esp]				; restore sse control word to old value
 	add esp, 8

Modified: trunk/code/asm/snapvector.c
===================================================================
--- trunk/code/asm/snapvector.c	2011-09-19 15:49:45 UTC (rev 2169)
+++ trunk/code/asm/snapvector.c	2011-09-19 18:30:24 UTC (rev 2170)
@@ -47,17 +47,18 @@
 
 		"movaps (%0), %%xmm1\n"
 		"movups (%2), %%xmm0\n"
+		"movaps %%xmm0, %%xmm2\n"
+		"andps %%xmm1, %%xmm0\n"
+		"andnps %%xmm2, %%xmm1\n"
 		"cvtps2dq %%xmm0, %%xmm0\n"
 		"cvtdq2ps %%xmm0, %%xmm0\n"
-		// vec MUST reside in register rdi as maskmovdqu uses
-		// it as an implicit operand. The "D" constraint makes
-		// sure of that.
-		"maskmovdqu %%xmm1, %%xmm0\n"
+		"orps %%xmm1, %%xmm0\n"
+		"movups %%xmm0, (%2)\n"
 		
 		"ldmxcsr %3\n"
 		:
-		: "r" (ssemask), "m" (ssecw), "D" (vec), "m" (oldcw)
-		: "memory", "%xmm0", "%xmm1"
+		: "r" (ssemask), "m" (ssecw), "r" (vec), "m" (oldcw)
+		: "memory", "%xmm0", "%xmm1", "%xmm2"
 	);
 	
 }



More information about the quake3-commits mailing list