[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: gcc-3.1.1



Tsubai Masanari <tsubai@iri.co.jp> writes:

> は
> 
> 	lwz 7,0(%2)
> 	lwz 8,4(%2)
> 	lwz 9,8(%2)
> 	lwz 10,12(%2)
> 	... 何度か繰り返し
> 
> 	(ループ内の最後で)
> 	addi %2,%2,64
> 
> のような形にした方がいいということです。

下のように変更して, 手許の Mac (500DP) で, とある data に in4_cksum を 
1000000 回かけてみたところ, user time は 13.9s から 12.4s に減りました. 
でも cpu によっては lwzu のほうが速かったりするとしたら, 単にこう変更
するわけにはいきませんね.

enami.
Index: in_cksum.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/powerpc/powerpc/in_cksum.c,v
retrieving revision 1.3
diff -u -r1.3 in_cksum.c
--- in_cksum.c	2001/06/13 06:01:50	1.3
+++ in_cksum.c	2002/07/29 09:57:53
@@ -103,7 +103,7 @@
 		 * Force to a word boundary.
 		 */
 		if ((3 & (long) w) && (mlen > 0)) {
-			if ((1 & (long) w) && (mlen > 0)) {
+			if ((1 & (long) w)) {
 				REDUCE;
 				sum <<= 8;
 				s_util.c[0] = *w++;
@@ -111,7 +111,18 @@
 				byte_swapped = 1;
 			}
 			if ((2 & (long) w) && (mlen > 1)) {
-				sum += *(uint16_t *)w;
+				/*
+				 * `sum' may already hold a full 32-bit value,
+				 * so add with carry instead of a plain add.
+				 */
+				__asm __volatile(
+				    "lhz 7,0(%1);"	/* load current data
+							   half word */
+				    "addc %0,%0,7;"	/* add to sum */
+				    "addze %0,%0;"	/* add carry bit */
+				    : "+r"(sum)
+				    : "b"(w)
+				    : "7");		/* clobber r7 */
 				w += 2;
 				mlen -= 2;
 			}
@@ -119,75 +130,72 @@
 
 		if (mlen >= 64) {
 			register int n __asm("r0");
-			uint8_t *tmpw;
 
 			n = mlen >> 6;
-			tmpw = w - 4;
 			asm volatile(
-				"addze 7,7;"		/* clear carry */
-				"mtctr %1;"		/* load loop count */
+				"addic 0,0,0;"		/* clear carry */
+				"mtctr %2;"		/* load loop count */
 				"1:"
-				"lwzu 7,4(%2);"		/* load current data word */
-				"lwzu 8,4(%2);"
-				"lwzu 9,4(%2);"
-				"lwzu 10,4(%2);"
+				"lwz 7,0(%1);"		/* load current data word */
+				"lwz 8,4(%1);"
+				"lwz 9,8(%1);"
+				"lwz 10,12(%1);"
 				"adde %0,%0,7;"		/* add to sum */
 				"adde %0,%0,8;"
 				"adde %0,%0,9;"
 				"adde %0,%0,10;"
-				"lwzu 7,4(%2);"
-				"lwzu 8,4(%2);"
-				"lwzu 9,4(%2);"
-				"lwzu 10,4(%2);"
+				"lwz 7,16(%1);"
+				"lwz 8,20(%1);"
+				"lwz 9,24(%1);"
+				"lwz 10,28(%1);"
 				"adde %0,%0,7;"
 				"adde %0,%0,8;"
 				"adde %0,%0,9;"
 				"adde %0,%0,10;"
-				"lwzu 7,4(%2);"
-				"lwzu 8,4(%2);"
-				"lwzu 9,4(%2);"
-				"lwzu 10,4(%2);"
+				"lwz 7,32(%1);"
+				"lwz 8,36(%1);"
+				"lwz 9,40(%1);"
+				"lwz 10,44(%1);"
 				"adde %0,%0,7;"
 				"adde %0,%0,8;"
 				"adde %0,%0,9;"
 				"adde %0,%0,10;"
-				"lwzu 7,4(%2);"
-				"lwzu 8,4(%2);"
-				"lwzu 9,4(%2);"
-				"lwzu 10,4(%2);"
+				"lwz 7,48(%1);"
+				"lwz 8,52(%1);"
+				"lwz 9,56(%1);"
+				"lwz 10,60(%1);"
 				"adde %0,%0,7;"
 				"adde %0,%0,8;"
 				"adde %0,%0,9;"
 				"adde %0,%0,10;"
+				"addi %1,%1,64;"
 				"bdnz 1b;"		/* loop */
 				"addze %0,%0;"		/* add carry bit */
-				: "+r"(sum)
-				: "r"(n), "r"(tmpw)
+				: "+r"(sum), "+b"(w)
+				: "r"(n)
 				: "7", "8", "9", "10");	/* clobber r7, r8, r9, r10 */
-			w += n * 64;
 			mlen -= n * 64;
 		}
 
 		if (mlen >= 8) {
 			register int n __asm("r0");
-			uint8_t *tmpw;
 
 			n = mlen >> 3;
-			tmpw = w - 4;
 			asm volatile(
-				"addze %1,%1;"		/* clear carry */
-				"mtctr %1;"		/* load loop count */
+				"addic 0,0,0;"		/* clear carry */
+				"mtctr %2;"		/* load loop count */
 				"1:"
-				"lwzu 7,4(%2);"		/* load current data word */
-				"lwzu 8,4(%2);"
+				"lwz 7,0(%1);"		/* load current data
+							   word */
+				"lwz 8,4(%1);"
 				"adde %0,%0,7;"		/* add to sum */
 				"adde %0,%0,8;"
+				"addi %1,%1,8;"
 				"bdnz 1b;"		/* loop */
 				"addze %0,%0;"		/* add carry bit */
-				: "+r"(sum)
-				: "r"(n), "r"(tmpw)
+				: "+r"(sum), "+b"(w)
+				: "r"(n)
 				: "7", "8");		/* clobber r7, r8 */
-			w += n * 8;
 			mlen -= n * 8;
 		}