Index: /trunk/source/level-0/ARM/arm-misc.lisp
===================================================================
--- /trunk/source/level-0/ARM/arm-misc.lisp	(revision 15080)
+++ /trunk/source/level-0/ARM/arm-misc.lisp	(revision 15081)
@@ -20,4 +20,5 @@
 (in-package "CCL")
 
+  
 ;;; Copy N bytes from pointer src, starting at byte offset src-offset,
 ;;; to ivector dest, starting at offset dest-offset.
@@ -27,9 +28,23 @@
 ;;; Does no arg checking of any kind.  Really.
 
-(defarmlapfunction %copy-ptr-to-ivector ((src (* 1 arm::node-size) )
-                                         (src-byte-offset 0) 
-                                         (dest arg_x)
-                                         (dest-byte-offset arg_y)
-                                         (nbytes arg_z))
+
+(defun %copy-ptr-to-ivector (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes)
+           (optimize (speed 3) (safety 0)))
+  (let* ((ptr-align (logand 7 (%ptr-to-int src))))
+    (declare (type (mod 8) ptr-align))
+    (if (and (>= nbytes 32)
+             (= 0 (logand nbytes 3))
+             (= 0 (logand dest-byte-offset 3))
+             (= 0 (logand (the fixnum (+ ptr-align src-byte-offset)) 3)))
+      (%copy-ptr-to-ivector-32bit src src-byte-offset dest dest-byte-offset nbytes)
+      (%copy-ptr-to-ivector-8bit src src-byte-offset dest dest-byte-offset nbytes))
+    dest))
+            
+(defarmlapfunction %copy-ptr-to-ivector-8bit ((src (* 1 arm::node-size) )
+                                               (src-byte-offset 0) 
+                                               (dest arg_x)
+                                               (dest-byte-offset arg_y)
+                                               (nbytes arg_z))
   (let ((src-reg imm0)
         (src-byteptr temp2)
@@ -57,9 +72,89 @@
     (bx lr)))
 
-(defarmlapfunction %copy-ivector-to-ptr ((src (* 1 arm::node-size))
-                                         (src-byte-offset 0)
-                                         (dest arg_x)
-                                         (dest-byte-offset arg_y)
-                                         (nbytes arg_z))
+;;; Everything's aligned OK and NBYTES is a multiple of 4.
+(defarmlapfunction %copy-ptr-to-ivector-32bit ((src (* 1 arm::node-size) )
+                                               (src-byte-offset 0) 
+                                               (dest arg_x)
+                                               (dest-byte-offset arg_y)
+                                               (nbytes arg_z))
+  (add imm1 vsp (:$ (* 2 arm::node-size)))
+  (build-lisp-frame imm0 imm1)
+  (add lr dest (:$ arm::misc-data-offset))
+  (add lr lr (:asr dest-byte-offset (:$ arm::fixnumshift)))
+  (ldr temp0 (:@ vsp (:$ src)))
+  (ldr imm1 (:@ vsp (:$ src-byte-offset)))
+  (macptr-ptr imm0 temp0)
+  (add imm0 imm0 (:asr imm1 (:$ arm::fixnumshift)))
+  (b @test32)
+  @loop32
+  (fldmias s0 (:! imm0) 8)
+  (fstmias s0 (:! lr) 8)
+  (sub nbytes nbytes '32)
+  @test32
+  (cmp nbytes '32)
+  (bge @loop32)
+  (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+  (nop)
+  (b @0)
+  (b @4)
+  (b @8)
+  (b @12)
+  (b @16)
+  (b @20)
+  (b @24)
+  (b @28)
+  (nop)
+  @0
+  (mov arg_z dest)
+  (restore-lisp-frame imm0)
+  (bx lr)
+  @4
+  (flds s0 (:@ imm0 (:$ 0)))
+  (fsts s0 (:@ lr (:$ 0)))
+  (b @0)
+  @8
+  (fldmias s0 imm0 2)
+  (fstmias s0 lr 2)
+  (b @0)
+  @12
+  (fldmias s0 imm0 3)
+  (fstmias s0 lr 3)
+  (b @0)
+  @16
+  (fldmias s0 imm0 4)
+  (fstmias s0 lr 4)
+  (b @0)
+  @20
+  (fldmias s0 imm0 5)
+  (fstmias s0 lr 5)
+  (b @0)
+  @24
+  (fldmias s0 imm0 6)
+  (fstmias s0 lr 6)
+  (b @0)
+  @28
+  (fldmias s0 imm0 7)
+  (fstmias s0 lr 7)
+  (b @0))
+  
+
+(defun %copy-ivector-to-ptr (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes)
+           (optimize (speed 3) (safety 0)))
+  (let* ((ptr-align (logand (the (unsigned-byte 32)(%ptr-to-int dest)) 7)))
+    (declare (type (mod 8) ptr-align))
+    (if (or (< nbytes 32)
+            (not (= 0 (logand nbytes 3)))
+            (not (= 0 (logand src-byte-offset 3)))
+            (not (= 0 (logand (the fixnum (+ ptr-align dest-byte-offset)) 3))))
+      (%copy-ivector-to-ptr-8bit src src-byte-offset dest dest-byte-offset nbytes)
+      (%copy-ivector-to-ptr-32bit src src-byte-offset dest dest-byte-offset nbytes))
+    dest))
+
+(defarmlapfunction %copy-ivector-to-ptr-8bit ((src (* 1 arm::node-size))
+                                              (src-byte-offset 0)
+                                              (dest arg_x)
+                                              (dest-byte-offset arg_y)
+                                              (nbytes arg_z))
   (ldr temp0 (:@ vsp (:$ src)))
   (cmp nbytes (:$ 0))
@@ -81,65 +176,263 @@
   (bx lr))
 
-(defarmlapfunction %copy-ivector-to-ivector ((src 4) 
-                                             (src-byte-offset 0) 
-                                             (dest arg_x)
-                                             (dest-byte-offset arg_y)
-                                             (nbytes arg_z))
+;;; Everything's aligned OK and NBYTES is a multiple of 4.
+(defarmlapfunction %copy-ivector-to-ptr-32bit ((src (* 1 arm::node-size) )
+                                               (src-byte-offset 0) 
+                                               (dest arg_x)
+                                               (dest-byte-offset arg_y)
+                                               (nbytes arg_z))
+  (add imm1 vsp (:$ (* 2 arm::node-size)))
+  (build-lisp-frame imm0 imm1)
+  (ldr temp0 (:@ vsp (:$ src)))
+  (ldr imm1 (:@ vsp (:$ src-byte-offset)))
+  (add lr temp0 (:$ arm::misc-data-offset))
+  (add lr lr (:asr imm1 (:$ arm::fixnumshift)))
+  (macptr-ptr imm0 dest)
+  (add imm0 imm0 (:asr dest-byte-offset (:$ arm::fixnumshift)))
+  (b @test32)
+  @loop32
+  (fldmias s0 (:! lr) 8)
+  (fstmias s0 (:! imm0) 8)
+  (sub nbytes nbytes '32)
+  @test32
+  (cmp nbytes '32)
+  (bge @loop32)
+  (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+  (nop)
+  (b @0)
+  (b @4)
+  (b @8)
+  (b @12)
+  (b @16)
+  (b @20)
+  (b @24)
+  (b @28)
+  (nop)
+  @0
+  (mov arg_z dest)
+  (restore-lisp-frame imm0)
+  (bx lr)
+  @4
+  (flds s0 (:@ lr (:$ 0)))
+  (fsts s0 (:@ imm0 (:$ 0)))
+  (b @0)
+  @8
+  (fldmias s0 lr 2)
+  (fstmias s0 imm0 2)
+  (b @0)
+  @12
+  (fldmias s0 lr 3)
+  (fstmias s0 imm0 3)
+  (b @0)
+  @16
+  (fldmias s0 lr 4)
+  (fstmias s0 imm0 4)
+  (b @0)
+  @20
+  (fldmias s0 lr 5)
+  (fstmias s0 imm0 5)
+  (b @0)
+  @24
+  (fldmias s0 lr 6)
+  (fstmias s0 imm0 6)
+  (b @0)
+  @28
+  (fldmias s0 lr 7)
+  (fstmias s0 imm0 7)
+  (b @0))
+
+
+(defun %copy-ivector-to-ivector (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+  (if (or (not (eq src dest))
+          (< dest-byte-offset src-byte-offset)
+          (>= dest-byte-offset (the fixnum (+ src-byte-offset nbytes))))
+    (%copy-ivector-to-ivector-postincrement src src-byte-offset dest dest-byte-offset nbytes)
+    (if (and (eq src dest)
+             (eql src-byte-offset dest-byte-offset))
+      dest
+      (%copy-ivector-to-ivector-predecrement src
+                                             (the fixnum (+ src-byte-offset nbytes))
+                                             dest
+                                             (the fixnum (+ dest-byte-offset nbytes))
+                                             nbytes)))
+  dest)
+
+(defun %copy-ivector-to-ivector-postincrement (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+  
+  (cond ((or (< nbytes 8)
+             (not (= (logand src-byte-offset 3)
+                     (logand dest-byte-offset 3))))
+         (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset dest dest-byte-offset nbytes))
+        (t
+         (let* ((prefix-size (- 4 (logand src-byte-offset 3))))
+           (declare (fixnum prefix-size))
+           (unless (= 4 prefix-size)
+             (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset dest dest-byte-offset prefix-size)
+             (incf src-byte-offset prefix-size)
+             (incf dest-byte-offset prefix-size)
+             (decf nbytes prefix-size)))
+         (let* ((tail-size (logand nbytes 3))
+                (fullword-size (- nbytes tail-size)))
+           (declare (fixnum tail-size fullword-size))
+           (unless (zerop fullword-size)
+             (%copy-ivector-to-ivector-postincrement-32bit src src-byte-offset dest dest-byte-offset fullword-size))
+           (unless (zerop tail-size)
+             (%copy-ivector-to-ivector-postincrement-8bit src (the fixnum (+ src-byte-offset fullword-size)) dest (the fixnum (+ dest-byte-offset fullword-size)) tail-size))))))
+
+(defun %copy-ivector-to-ivector-predecrement (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+  (cond ((or (< nbytes 8)
+             (not (= (logand src-byte-offset 3)
+                     (logand dest-byte-offset 3))))
+         (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset dest dest-byte-offset nbytes))
+    (t
+      (let* ((suffix-size (logand src-byte-offset 3)))
+        (declare (fixnum suffix-size))
+        (unless (zerop suffix-size)
+          (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset dest dest-byte-offset suffix-size)
+          (decf src-byte-offset suffix-size)
+          (decf dest-byte-offset suffix-size)
+          (decf nbytes suffix-size)))
+      (let* ((head-size (logand nbytes 3))
+             (fullword-size (- nbytes head-size)))
+        (declare (fixnum head-size fullword-size))
+        (unless (zerop fullword-size)
+          (%copy-ivector-to-ivector-predecrement-32bit src src-byte-offset dest dest-byte-offset fullword-size))
+        (unless (zerop head-size)
+          (%copy-ivector-to-ivector-predecrement-8bit src (the fixnum (- src-byte-offset fullword-size)) dest (the fixnum (- dest-byte-offset fullword-size)) head-size))))
+))
+
+(defarmlapfunction %copy-ivector-to-ivector-postincrement-8bit ((src 4) 
+                                                                (src-byte-offset 0) 
+                                                                (dest arg_x)
+                                                                (dest-byte-offset arg_y)
+                                                                (nbytes arg_z))
   (let ((rsrc temp0)
         (scaled-src-idx imm1)
         (scaled-dest-idx imm2)
-        (val imm0)
-        (nwords dest-byte-offset))
+        (val imm0))
     (cmp nbytes (:$ 0))
     (vpop1 scaled-src-idx)
     (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
-    (mov val scaled-src-idx)
     (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+    (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+    (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset))    
     (vpop1 rsrc)
-    (beq @done)
-    (cmp rsrc dest)
-    (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
-    (orr val val scaled-dest-idx)
-    (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset))
-    (beq @SisD)
-    @fwd
-    (tst val (:$ 3))
-    (bne @loop)
-    ;; src and dest offsets are word-aligned. Copy words.
-    (b @wtest)
-    @words                              ; source and dest different - words 
-    (sub nbytes nbytes '4)  
-    (ldr val (:@ rsrc scaled-src-idx))
-    (add scaled-src-idx scaled-src-idx '1)
-    (str val (:@ dest scaled-dest-idx))
-    (add scaled-dest-idx scaled-dest-idx '1)
-    @wtest
-    (cmp nbytes '4)
-    (bge @words)
-    (cmp nbytes '0)
     (b @test)
     @loop
     (subs nbytes nbytes '1)
-    (ldrb val (:@ temp0 scaled-src-idx))
+    (ldrb val (:@ rsrc scaled-src-idx))
     (add scaled-src-idx scaled-src-idx (:$ 1))
     (strb val (:@ dest scaled-dest-idx))
     (add scaled-dest-idx scaled-dest-idx (:$ 1))
     @test
-    (bne  @loop)
-    @done
+    (bne @loop)
     (mov arg_z dest)
-    (bx lr)
-
-    @SisD
-    (cmp scaled-src-idx scaled-dest-idx) ; cmp src and dest
-    (beq @done)
-    (bgt @fwd)
-
-  
-    ;; Copy backwards when src & dest are the same and we're sliding down
-    @bwd
-    (add scaled-src-idx scaled-src-idx (:lsr nbytes (:$ arm::fixnumshift)))
-    (add scaled-dest-idx scaled-dest-idx (:lsr nbytes (:$ arm::fixnumshift)))
-    @loop2
+    (bx lr)))
+
+(defarmlapfunction %copy-ivector-to-ivector-postincrement-32bit ((src 4) 
+                                                                 (src-byte-offset 0) 
+                                                                 (dest arg_x)
+                                                                 (dest-byte-offset arg_y)
+                                                                 (nbytes arg_z))
+  (let ((rsrc temp0)
+        (scaled-src-idx imm1)
+        (scaled-dest-idx imm2)
+        (val imm0))
+    (cmp nbytes '32)
+    (vpop1 scaled-src-idx)
+    (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
+    (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+    (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+    (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset))    
+    (vpop1 rsrc)
+    (build-lisp-frame imm0)             
+    (b @test)
+    @loop
+    (sub nbytes nbytes '32)
+    (cmp nbytes '32)
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 8)
+    (add scaled-src-idx scaled-src-idx (:$ 32))
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 8)
+    (add scaled-dest-idx scaled-dest-idx (:$ 32))
+    @test
+    (bge @loop)
+    (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+    (nop)
+    (b @0)
+    (b @4)
+    (b @8)
+    (b @12)
+    (b @16)
+    (b @20)
+    (b @24)
+    (b @28)
+    (nop)
+    @4
+    (ldr val (:@ rsrc scaled-src-idx))
+    (str val (:@ dest scaled-dest-idx))
+    (b @0)
+    @8
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 2)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 2)
+    (b @0)
+    @12
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 3)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 3)
+    (b @0)
+    @16
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 4)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 4)
+    (b @0)
+    @20
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 5)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 5)
+    (b @0)
+    @24
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 6)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 6)
+    (b @0)
+    @28
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 7)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 7)
+    @0
+    (mov arg_z dest)
+    (restore-lisp-frame imm0)
+    (bx lr)))
+
+(defarmlapfunction %copy-ivector-to-ivector-predecrement-8bit ((src 4) 
+                                                               (src-byte-offset 0) 
+                                                               (dest arg_x)
+                                                               (dest-byte-offset arg_y)
+                                                               (nbytes arg_z))
+  (let ((rsrc temp0)
+        (scaled-src-idx imm1)
+        (scaled-dest-idx imm2)
+        (val imm0))
+    (cmp nbytes (:$ 0))
+    (vpop1 scaled-src-idx)
+    (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
+    (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+    (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+    (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset))    
+    (vpop1 rsrc)
+    (b @test)
+    @loop
     (sub scaled-src-idx scaled-src-idx (:$ 1))
     (sub scaled-dest-idx scaled-dest-idx (:$ 1))
@@ -147,10 +440,94 @@
     (ldrb val (:@ rsrc scaled-src-idx))
     (strb val (:@ dest scaled-dest-idx))
-    @test2
-    (bne @loop2)
-    (b @done)))
-
-
-  
+    @test
+    (bne @loop)
+    (mov arg_z dest)
+    (bx lr)))
+
+(defarmlapfunction %copy-ivector-to-ivector-predecrement-32bit ((src 4) 
+                                                                (src-byte-offset 0) 
+                                                                (dest arg_x)
+                                                                (dest-byte-offset arg_y)
+                                                                (nbytes arg_z))
+  (let ((rsrc temp0)
+        (scaled-src-idx imm1)
+        (scaled-dest-idx imm2)
+        (val imm0))
+    (cmp nbytes (:$ 32))
+    (vpop1 scaled-src-idx)
+    (mov scaled-src-idx (:lsr scaled-src-idx (:$ arm::fixnumshift)))
+    (add scaled-src-idx scaled-src-idx (:$ arm::misc-data-offset))
+    (mov scaled-dest-idx (:lsr dest-byte-offset (:$ arm::fixnumshift)))
+    (add scaled-dest-idx scaled-dest-idx (:$ arm::misc-data-offset))    
+    (vpop1 rsrc)
+    (build-lisp-frame imm0)             
+    (b @test)
+    @loop
+    (sub scaled-src-idx scaled-src-idx (:$ 32))
+    (sub scaled-dest-idx scaled-dest-idx (:$ 32))
+    (sub nbytes nbytes '32)
+    (cmp nbytes '32)
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 8)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 8)
+    @test
+    (bge @loop)
+    (sub scaled-src-idx scaled-src-idx (:asr nbytes (:$ arm::fixnumshift)))
+    (sub scaled-dest-idx scaled-dest-idx (:asr nbytes (:$ arm::fixnumshift)))
+    (add pc pc (:asr nbytes (:$ arm::fixnumshift)))
+    (nop)
+    (b @0)
+    (b @4)
+    (b @8)
+    (b @12)
+    (b @16)
+    (b @20)
+    (b @24)
+    (b @28)
+    (nop)
+    @4
+    (ldr val (:@ rsrc scaled-src-idx))
+    (str val (:@ dest scaled-dest-idx))
+    (b @0)
+    @8
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 2)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 2)
+    (b @0)
+    @12
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 3)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 3)
+    (b @0)
+    @16
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 4)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 4)
+    (b @0)
+    @20
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 5)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 5)
+    (b @0)
+    @24
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 6)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 6)
+    (b @0)
+    @28
+    (add lr rsrc scaled-src-idx)
+    (fldmias s0 lr 7)
+    (add lr dest scaled-dest-idx)
+    (fstmias s0 lr 7)
+    @0
+    (mov arg_z dest)
+    (restore-lisp-frame imm0)
+    (bx lr)))
 
 (defarmlapfunction %copy-gvector-to-gvector ((src (* 1 arm::node-size))
