Index: /trunk/source/level-0/X86/x86-misc.lisp
===================================================================
--- /trunk/source/level-0/X86/x86-misc.lisp	(revision 15035)
+++ /trunk/source/level-0/X86/x86-misc.lisp	(revision 15036)
@@ -63,8 +63,13 @@
                                          (nbytes arg_z))
   (let ((rsrc temp0)
-        (rsrc-byte-offset temp1))
+        (rsrc-byte-offset imm0)
+        (rdest-byte-offset imm1)
+        (rdata imm2))
+    (movq (@ src-byte-offset (% rsp)) (% rsrc-byte-offset))
+    (sarq ($ x8664::word-shift) (% rsrc-byte-offset))
+    (movq (% dest-byte-offset) (% rdest-byte-offset))
+    (sarq ($ x8664::word-shift) (% rdest-byte-offset))
+    (movq (@ src (% rsp)) (% rsrc))
     (testq (% nbytes) (% nbytes))
-    (movq (@ src-byte-offset (% rsp)) (% rsrc-byte-offset))
-    (movq (@ src (% rsp)) (% rsrc))
     (jmp @test)
     @loop
@@ -85,90 +90,377 @@
 (defun %copy-ivector-to-ivector (src src-byte-offset dest dest-byte-offset nbytes)
   (declare (fixnum src-byte-offset dest-byte-offset nbytes))
-  (if (or (eq src dest)
-          (not (eql 0 src-byte-offset))
-          (not (eql 0 dest-byte-offset))
-          (< nbytes 8))
-    (%copy-ivector-to-ivector-bytes src src-byte-offset dest dest-byte-offset nbytes)
-    (%copy-ivector-to-ivector-words src dest (ash nbytes -3) (logand nbytes 7))))
-
-(defx86lapfunction %copy-ivector-to-ivector-words ((src 8)
-                                                   #|(ra 0)|#
-                                                   (dest arg_x)
-                                                   (nwords arg_y)
-                                                   (nbytes arg_z))
+  (if (or (not (eq src dest))           ; copy in ascending order when the vectors differ,
+          (< dest-byte-offset src-byte-offset) ; or the destination starts below the source,
+          (>= dest-byte-offset (the fixnum (+ src-byte-offset nbytes)))) ; or at/after the end of the source range
+    (%copy-ivector-to-ivector-postincrement src src-byte-offset dest dest-byte-offset nbytes)
+    (if (and (eq src dest)              ; always true here: the first test above failed
+             (eql src-byte-offset dest-byte-offset))
+      dest                              ; same vector, same offset: nothing to copy
+      (%copy-ivector-to-ivector-predecrement src ; otherwise copy in descending order ...
+                                             (the fixnum (+ src-byte-offset nbytes)) ; ... from the END of the source range
+                                             dest
+                                             (the fixnum (+ dest-byte-offset nbytes)) ; ... to the END of the destination range
+                                             nbytes)))
+  dest)                                 ; DEST is returned on every path
+
+(defun %copy-ivector-to-ivector-postincrement (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes))
+  ;; Copy NBYTES bytes from SRC to DEST in ascending order, using the widest unit the two offsets allow.
+  (cond ((or (< nbytes 8)               ; short copy, or ...
+             (not (= (logand src-byte-offset 3) ; ... offsets not congruent mod 4:
+                     (logand dest-byte-offset 3))))
+         (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset dest dest-byte-offset nbytes)) ; byte-at-a-time only
+        ((and (>= nbytes 80)            ; large copy with offsets congruent mod 16: 128-bit units
+              (= (logand src-byte-offset 15)
+                 (logand dest-byte-offset 15)))
+         (let* ((prefix-size (- 16 (logand (the fixnum (+ src-byte-offset 8)) 15)))) ; bytes until (offset + 8) is a multiple of 16; NOTE(review): the 8 presumably reflects where vector data starts relative to a 16-byte boundary - confirm against the x8664 layout
+           (declare (fixnum prefix-size))
+           (unless (= 16 prefix-size)   ; 16 means already aligned: no prefix to copy
+             (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset dest dest-byte-offset prefix-size)
+             (incf src-byte-offset prefix-size) ; step past the bytes just copied
+             (incf dest-byte-offset prefix-size)
+             (decf nbytes prefix-size)))
+         (let* ((tail-size (logand nbytes 15)) ; bytes left over after the whole 16-byte units
+                (dqsize (- nbytes tail-size))) ; byte count handed to the 128-bit routine (a multiple of 16)
+           (declare (fixnum tail-size dqsize))
+           (%copy-ivector-to-ivector-postincrement-128bit src src-byte-offset dest dest-byte-offset dqsize)
+           (unless (zerop tail-size)    ; finish the remainder byte-at-a-time
+             (%copy-ivector-to-ivector-postincrement-8bit src (the fixnum (+ src-byte-offset dqsize)) dest (the fixnum (+ dest-byte-offset dqsize)) tail-size))))
+        ((= (logand src-byte-offset 7) (logand dest-byte-offset 7)) ; offsets congruent mod 8: 64-bit units
+         (let* ((prefix-size (- 8 (logand src-byte-offset 7)))) ; bytes up to the next multiple of 8
+           (declare (fixnum prefix-size))
+           (unless (= 8 prefix-size)    ; 8 means the offset is already a multiple of 8
+             (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset dest dest-byte-offset prefix-size)
+             (incf src-byte-offset prefix-size)
+             (incf dest-byte-offset prefix-size)
+             (decf nbytes prefix-size)))
+         (let* ((tail-size (logand nbytes 7)) ; bytes left over after the whole 8-byte units
+                (fullword-size (- nbytes tail-size))) ; byte count handed to the 64-bit routine
+           (declare (fixnum tail-size fullword-size))
+           (unless (zerop fullword-size)
+             (%copy-ivector-to-ivector-postincrement-64bit src src-byte-offset dest dest-byte-offset fullword-size))
+           (unless (zerop tail-size)    ; finish the remainder byte-at-a-time
+             (%copy-ivector-to-ivector-postincrement-8bit src (the fixnum (+ src-byte-offset fullword-size)) dest (the fixnum (+ dest-byte-offset fullword-size)) tail-size))))
+        (t                              ; offsets congruent mod 4 only (ensured by the first clause): 32-bit units
+         (let* ((prefix-size (- 4 (logand src-byte-offset 3)))) ; bytes up to the next multiple of 4
+           (declare (fixnum prefix-size))
+           (unless (= 4 prefix-size)    ; 4 means the offset is already a multiple of 4
+             (%copy-ivector-to-ivector-postincrement-8bit src src-byte-offset dest dest-byte-offset prefix-size)
+             (incf src-byte-offset prefix-size)
+             (incf dest-byte-offset prefix-size)
+             (decf nbytes prefix-size)))
+         (let* ((tail-size (logand nbytes 3)) ; bytes left over after the whole 4-byte units
+                (fullword-size (- nbytes tail-size))) ; byte count handed to the 32-bit routine
+           (declare (fixnum tail-size fullword-size))
+           (unless (zerop fullword-size)
+             (%copy-ivector-to-ivector-postincrement-32bit src src-byte-offset dest dest-byte-offset fullword-size))
+           (unless (zerop tail-size)    ; finish the remainder byte-at-a-time
+             (%copy-ivector-to-ivector-postincrement-8bit src (the fixnum (+ src-byte-offset fullword-size)) dest (the fixnum (+ dest-byte-offset fullword-size)) tail-size))))))
+
+(defun %copy-ivector-to-ivector-predecrement (src src-byte-offset dest dest-byte-offset nbytes)
+  (declare (fixnum src-byte-offset dest-byte-offset nbytes)) ; Copies NBYTES bytes downward; both offsets are END offsets (one past the last byte).
+  (cond ((or (< nbytes 8)               ; short copy, or ...
+             (not (= (logand src-byte-offset 3) ; ... offsets not congruent mod 4:
+                     (logand dest-byte-offset 3))))
+         (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset dest dest-byte-offset nbytes)) ; byte-at-a-time only
+        ((and (>= nbytes 80)            ; large copy with offsets congruent mod 16: 128-bit units
+              (= (logand src-byte-offset 15)
+                 (logand dest-byte-offset 15)))
+         (let* ((suffix-size (logand (the fixnum (+ src-byte-offset 8)) 15))) ; bias by 8 before masking, as the postincrement variant does, so the 128-bit routine's MOVDQA accesses are 16-byte aligned
+           (declare (fixnum suffix-size))
+           (unless (zerop suffix-size)  ; copy the unaligned bytes at the top of the range first
+             (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset dest dest-byte-offset suffix-size)
+             (decf src-byte-offset suffix-size) ; step below the bytes just copied
+             (decf dest-byte-offset suffix-size)
+             (decf nbytes suffix-size)))
+         (let* ((head-size (logand nbytes 15)) ; bytes left over below the whole 16-byte units
+                (fullword-size (- nbytes head-size))) ; byte count handed to the 128-bit routine (a multiple of 16)
+           (declare (fixnum head-size fullword-size))
+           (unless (zerop fullword-size)
+             (%copy-ivector-to-ivector-predecrement-128bit src src-byte-offset dest dest-byte-offset fullword-size))
+           (unless (zerop head-size)    ; finish the lowest bytes byte-at-a-time
+             (%copy-ivector-to-ivector-predecrement-8bit src (the fixnum (- src-byte-offset fullword-size)) dest (the fixnum (- dest-byte-offset fullword-size)) head-size))))
+        ((= (logand src-byte-offset 7) (logand dest-byte-offset 7)) ; offsets congruent mod 8: 64-bit units
+         (let* ((suffix-size (logand src-byte-offset 7))) ; bytes above the highest multiple of 8
+           (declare (fixnum suffix-size))
+           (unless (zerop suffix-size)
+             (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset dest dest-byte-offset suffix-size)
+             (decf src-byte-offset suffix-size)
+             (decf dest-byte-offset suffix-size)
+             (decf nbytes suffix-size)))
+         (let* ((head-size (logand nbytes 7)) ; bytes left over below the whole 8-byte units
+                (fullword-size (- nbytes head-size))) ; byte count handed to the 64-bit routine
+           (declare (fixnum head-size fullword-size))
+           (unless (zerop fullword-size)
+             (%copy-ivector-to-ivector-predecrement-64bit src src-byte-offset dest dest-byte-offset fullword-size))
+           (unless (zerop head-size)    ; finish the lowest bytes byte-at-a-time
+             (%copy-ivector-to-ivector-predecrement-8bit src (the fixnum (- src-byte-offset fullword-size)) dest (the fixnum (- dest-byte-offset fullword-size)) head-size))))
+        (t                              ; offsets congruent mod 4 only (ensured by the first clause): 32-bit units
+         (let* ((suffix-size (logand src-byte-offset 3))) ; bytes above the highest multiple of 4
+           (declare (fixnum suffix-size))
+           (unless (zerop suffix-size)
+             (%copy-ivector-to-ivector-predecrement-8bit src src-byte-offset dest dest-byte-offset suffix-size)
+             (decf src-byte-offset suffix-size)
+             (decf dest-byte-offset suffix-size)
+             (decf nbytes suffix-size)))
+         (let* ((head-size (logand nbytes 3)) ; bytes left over below the whole 4-byte units
+                (fullword-size (- nbytes head-size))) ; byte count handed to the 32-bit routine
+           (declare (fixnum head-size fullword-size))
+           (unless (zerop fullword-size)
+             (%copy-ivector-to-ivector-predecrement-32bit src src-byte-offset dest dest-byte-offset fullword-size))
+           (unless (zerop head-size)    ; finish the lowest bytes byte-at-a-time
+             (%copy-ivector-to-ivector-predecrement-8bit src (the fixnum (- src-byte-offset fullword-size)) dest (the fixnum (- dest-byte-offset fullword-size)) head-size))))))
+
+(defx86lapfunction %copy-ivector-to-ivector-postincrement-8bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z)) ; ascending byte-at-a-time copy; returns DEST
   (let ((rsrc temp0)
-         (ridx imm1)
-         (rval imm0))
-    (xorl (%l ridx) (%l ridx))
+        (srcidx imm0)                   ; untagged source byte index
+        (destidx imm1)                  ; untagged destination byte index
+        (data imm2))                    ; byte in transit
     (movq (@ src (% rsp)) (% rsrc))
-    (jmp @word-test)
-    @word-loop
-    (movq (@ x8664::misc-data-offset (% rsrc) (% ridx)) (% rval))
-    (movq (% rval) (@ x8664::misc-data-offset (% dest) (% ridx)))
-    (addq ($ 8) (% ridx))
-    @word-test
-    (cmpq (% ridx) (% nwords))
-    (jne @word-loop)
-    (jmp @byte-test)
-    @byte-loop
-    (movb (@ x8664::misc-data-offset (% rsrc) (% ridx)) (%b rval))
-    (movb (%b rval) (@ x8664::misc-data-offset (% dest) (% ridx)))
-    (addq ($ 1) (% ridx))
-    @byte-test
+    (movq (@ src-byte-offset (% rsp)) (% srcidx)) ; src-byte-offset is passed on the stack
+    (sarq ($ target::fixnumshift) (% srcidx)) ; strip the fixnum tag
+    (movq (% dest-byte-offset) (% destidx))
+    (sarq ($ target::fixnumshift) (% destidx)) ; strip the fixnum tag
+    (jmp @test)                         ; enter at the test so a zero count copies nothing
+    @loop
+    (movzbl (@ target::misc-data-offset (% rsrc) (% srcidx)) (%l data)) ; load one source byte, zero-extended
+    (movb (%b data) (@ target::misc-data-offset (% dest) (% destidx))) ; store it to the destination
+    (lea (@ 1 (% destidx)) (% destidx)) ; advance both indices by one byte
+    (lea (@ 1 (% srcidx)) (% srcidx))
+    @test
     (subq ($ '1) (% nbytes))
-    (jns @byte-loop)
-    (movq (% dest) (% arg_z))
-    (single-value-return 3)))
-          
-    
-    
-
-(defx86lapfunction %copy-ivector-to-ivector-bytes ((src-offset 16) 
-                                                   (src-byte-offset 8)
-                                                   #|(ra 0)|#
-                                                   (dest arg_x)
-                                                   (dest-byte-offset arg_y)
-                                                   (nbytes arg_z))
-  (let ((rsrc temp0)
-        (rsrc-byte-offset temp1))
-    (movq (@ src-byte-offset (% rsp)) (% rsrc-byte-offset))
-    (movq (@ src-offset (% rsp)) (% rsrc))
-    (cmpq (% dest) (% rsrc))
-    (jne @front)
-    (cmpq (% rsrc-byte-offset) (% dest-byte-offset))
-    (jg @back)
-    @front
-    (testq (% nbytes) (% nbytes))
-    (jmp @front-test)
-    @front-loop
-    (unbox-fixnum rsrc-byte-offset imm0)
-    (addq ($ '1) (% rsrc-byte-offset))
-    (movb (@ x8664::misc-data-offset (% rsrc) (% imm0)) (%b imm0))
-    (unbox-fixnum dest-byte-offset imm1)
-    (addq ($ '1) (% dest-byte-offset))
-    (movb (%b imm0) (@ x8664::misc-data-offset (% dest) (% imm1)))
-    (subq ($ '1) (% nbytes))
-    @front-test
-    (jne @front-loop)
-    (movq (% dest) (% arg_z))
-    (single-value-return 4)
-    @back
-    (addq (% nbytes) (% rsrc-byte-offset))
-    (addq (% nbytes) (% dest-byte-offset))
-    (testq (% nbytes) (% nbytes))
-    (jmp @back-test)
-    @back-loop
-    (subq ($ '1) (% rsrc-byte-offset))
-    (unbox-fixnum rsrc-byte-offset imm0)
-    (movb (@ x8664::misc-data-offset (% rsrc) (% imm0)) (%b imm0))
-    (subq ($ '1) (% dest-byte-offset))
-    (unbox-fixnum dest-byte-offset imm1)
-    (subq ($ '1) (% nbytes))
-    (movb (%b imm0) (@ x8664::misc-data-offset (% dest) (% imm1)))
-    @back-test
-    (jne @back-loop)
+    (jge @loop)                         ; keep going while the decremented count is still >= 0
     (movq (% dest) (% arg_z))
     (single-value-return 4)))
-  
+
+
+
+(defx86lapfunction %copy-ivector-to-ivector-predecrement-8bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; descending byte-at-a-time copy; returns DEST
+        (srcidx imm0)                   ; untagged source byte index (decremented before each load)
+        (destidx imm1)                  ; untagged destination byte index (decremented before each store)
+        (data imm2))                    ; byte in transit
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    (sarq ($ target::fixnumshift) (% srcidx)) ; strip the fixnum tag
+    (movq (% dest-byte-offset) (% destidx))
+    (sarq ($ target::fixnumshift) (% destidx)) ; strip the fixnum tag
+    (jmp @test)                         ; enter at the test so a zero count copies nothing
+    @loop
+    (lea (@ -1 (% destidx)) (% destidx)) ; step both indices down one byte first
+    (lea (@ -1 (% srcidx)) (% srcidx))
+    (movzbl (@ target::misc-data-offset (% rsrc) (% srcidx)) (%l data)) ; load one source byte, zero-extended
+    (movb (%b data) (@ target::misc-data-offset (% dest) (% destidx))) ; store it to the destination
+    @test
+    (subq ($ '1) (% nbytes))            ; one fewer byte remaining (count is a tagged fixnum)
+    (jge @loop)                         ; keep going while the decremented count is still >= 0
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
+
+(defx86lapfunction %copy-ivector-to-ivector-postincrement-32bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; ascending copy in 4-byte units; returns DEST
+        (srcidx imm0)                   ; untagged source byte index
+        (destidx imm1)                  ; untagged destination byte index
+        (data imm2))                    ; 32-bit word in transit
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    (sarq ($ target::fixnumshift) (% srcidx)) ; strip the fixnum tag
+    (movq (% dest-byte-offset) (% destidx))
+    (sarq ($ target::fixnumshift) (% destidx)) ; strip the fixnum tag
+    (jmp @test)                         ; enter at the test so a zero count copies nothing
+    @loop
+    (movl (@ target::misc-data-offset (% rsrc) (% srcidx)) (%l data)) ; load 4 source bytes
+    (movl (%l data) (@ target::misc-data-offset (% dest) (% destidx))) ; store them to the destination
+    (lea (@ 4 (% destidx)) (% destidx)) ; advance both indices by 4 bytes
+    (lea (@ 4 (% srcidx)) (% srcidx))
+    @test
+    (subq ($ '4) (% nbytes))            ; 4 fewer bytes remaining (count is a tagged fixnum)
+    (jge @loop)                         ; keep going while the decremented count is still >= 0
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
+(defx86lapfunction %copy-ivector-to-ivector-predecrement-32bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; descending copy in 4-byte units; returns DEST
+        (srcidx imm0)                   ; untagged source byte index (decremented before each load)
+        (destidx imm1)                  ; untagged destination byte index (decremented before each store)
+        (data imm2))                    ; 32-bit word in transit
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    (sarq ($ target::fixnumshift) (% srcidx)) ; strip the fixnum tag
+    (movq (% dest-byte-offset) (% destidx))
+    (sarq ($ target::fixnumshift) (% destidx)) ; strip the fixnum tag
+    (jmp @test)                         ; enter at the test so a zero count copies nothing
+    @loop
+    (lea (@ -4 (% destidx)) (% destidx)) ; step both indices down 4 bytes first
+    (lea (@ -4 (% srcidx)) (% srcidx))
+    (movl (@ target::misc-data-offset (% rsrc) (% srcidx)) (%l data)) ; load 4 source bytes
+    (movl (%l data) (@ target::misc-data-offset (% dest) (% destidx))) ; store them to the destination
+    @test
+    (subq ($ '4) (% nbytes))            ; 4 fewer bytes remaining (count is a tagged fixnum)
+    (jge @loop)                         ; keep going while the decremented count is still >= 0
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
+(defx86lapfunction %copy-ivector-to-ivector-postincrement-64bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; ascending copy in 8-byte units, two per loop iteration; returns DEST
+        (srcidx temp1)                  ; source byte index
+        (destidx dest-byte-offset)      ; destination byte index, reusing the argument register in place
+        (data0 imm0)                    ; first 64-bit word in transit
+        (data1 imm1))                   ; second 64-bit word in transit
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    ;; srcidx and destidx are multiples of 8, so it's safe to right-shift
+    ;; them here (they remain fixnums).
+    (sarq ($ target::word-shift) (% srcidx))
+    (sarq ($ target::word-shift) (% destidx))
+    (testq ($ '8) (% nbytes))           ; is the count an odd multiple of 8?
+    (jz @test)                          ; no: go straight to the 16-bytes-per-iteration loop
+    (movq (@ target::misc-data-offset (% rsrc) (% srcidx)) (% data0)) ; yes: copy a single 8-byte word first
+    (movq (% data0) (@ target::misc-data-offset (% dest) (% destidx)))
+    (lea (@ 8 (% destidx)) (% destidx)) ; advance both indices past that word
+    (lea (@ 8 (% srcidx)) (% srcidx))
+    (subq ($ '8) (% nbytes))    ; 8 fewer bytes remaining
+    (jmp @test)
+    @loop
+    (movq (@ target::misc-data-offset (% rsrc) (% srcidx)) (% data0)) ; load two consecutive 8-byte words ...
+    (movq (@ (+ 8 target::misc-data-offset) (% rsrc) (% srcidx)) (% data1))
+    (movq (% data0) (@ target::misc-data-offset (% dest) (% destidx))) ; ... then store both
+    (movq (% data1) (@ (+ 8 target::misc-data-offset) (% dest) (% destidx)))
+    (lea (@ 16 (% destidx)) (% destidx)) ; advance both indices by 16 bytes
+    (lea (@ 16 (% srcidx)) (% srcidx))
+    @test
+    (subq ($ '16) (% nbytes))           ; 16 fewer bytes remaining (count is a tagged fixnum)
+    (jge @loop)                         ; keep going while the decremented count is still >= 0
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
+(defx86lapfunction %copy-ivector-to-ivector-predecrement-64bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; descending copy in 8-byte units, two per loop iteration; returns DEST
+        (srcidx temp1)                  ; source byte index (decremented before each load)
+        (destidx dest-byte-offset)      ; destination byte index, reusing the argument register in place
+        (data0 imm0)                    ; first 64-bit word in transit
+        (data1 imm1))                   ; second 64-bit word in transit
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    ;; srcidx and destidx are multiples of 8, so it's safe to right-shift
+    ;; them here (they remain fixnums).
+    (sarq ($ target::word-shift) (% srcidx))
+    (sarq ($ target::word-shift) (% destidx))
+    (testq ($ '8) (% nbytes))           ; is the count an odd multiple of 8?
+    (jz @test)                          ; no: go straight to the 16-bytes-per-iteration loop
+    (lea (@ -8 (% destidx)) (% destidx)) ; yes: step down 8 bytes and copy a single word first
+    (lea (@ -8 (% srcidx)) (% srcidx))
+    (movq (@ target::misc-data-offset (% rsrc) (% srcidx)) (% data0))
+    (movq (% data0) (@ target::misc-data-offset (% dest) (% destidx)))
+    (subq ($ '8) (% nbytes))    ; 8 fewer bytes remaining
+    (jmp @test)
+    @loop
+    (lea (@ -16 (% destidx)) (% destidx)) ; step both indices down 16 bytes first
+    (lea (@ -16 (% srcidx)) (% srcidx))
+    (movq (@ target::misc-data-offset (% rsrc) (% srcidx)) (% data0)) ; load two consecutive 8-byte words ...
+    (movq (@ (+ 8 target::misc-data-offset) (% rsrc) (% srcidx)) (% data1))
+    (movq (% data0) (@ target::misc-data-offset (% dest) (% destidx))) ; ... then store both
+    (movq (% data1) (@ (+ 8 target::misc-data-offset) (% dest) (% destidx)))
+    @test
+    (subq ($ '16) (% nbytes))           ; 16 fewer bytes remaining (count is a tagged fixnum)
+    (jge @loop)                         ; keep going while the decremented count is still >= 0
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
+(defx86lapfunction %copy-ivector-to-ivector-postincrement-128bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; ascending copy in 16-byte units via xmm0-xmm3; returns DEST
+        (srcidx imm0)                   ; untagged source byte index
+        (destidx imm1))                 ; untagged destination byte index
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    (sarq ($ target::fixnumshift) (% srcidx)) ; strip the fixnum tag
+    (movq (% dest-byte-offset) (% destidx))
+    (sarq ($ target::fixnumshift) (% destidx)) ; strip the fixnum tag
+    (jmp @test)
+    @loop
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0)) ; load 64 bytes as four aligned 16-byte moves (movdqa needs 16-byte-aligned addresses)
+    (movdqa (@ (+ 16 target::misc-data-offset) (% rsrc) (% srcidx)) (% xmm1))
+    (movdqa (@ (+ 32 target::misc-data-offset) (% rsrc) (% srcidx)) (% xmm2))
+    (movdqa (@ (+ 48 target::misc-data-offset) (% rsrc) (% srcidx)) (% xmm3))
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx))) ; store the same 64 bytes
+    (movdqa (% xmm1) (@ (+ 16 target::misc-data-offset) (% dest) (% destidx)))
+    (movdqa (% xmm2) (@ (+ 32 target::misc-data-offset) (% dest) (% destidx)))
+    (movdqa (% xmm3) (@ (+ 48 target::misc-data-offset) (% dest) (% destidx)))
+    (lea (@ 64 (% destidx)) (% destidx)) ; advance both indices by 64 bytes
+    (lea (@ 64 (% srcidx)) (% srcidx))
+    (subq ($ '64) (% nbytes))           ; 64 fewer bytes remaining (count is a tagged fixnum)
+    @test
+    (cmpq ($ '64) (% nbytes))           ; at least 64 bytes left?
+    (jge @loop)
+    (testq (% nbytes) (% nbytes))       ; nothing left?
+    (je @done)
+    (cmpq ($ '32) (% nbytes))           ; remainder is expected to be 16, 32 or 48 (the caller in this file passes a multiple of 16)
+    (je @two)                           ; exactly 32: two more 16-byte moves
+    (jl @one)                           ; below 32: one more 16-byte move
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0)) ; above 32: three moves, falling through @two and @one
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx)))
+    (lea (@ 16 (% srcidx)) (% srcidx))
+    (lea (@ 16 (% destidx)) (% destidx))
+    @two
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0))
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx)))
+    (lea (@ 16 (% srcidx)) (% srcidx))
+    (lea (@ 16 (% destidx)) (% destidx))
+    @one
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0)) ; final 16 bytes; indices need no further update
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx)))
+    @done
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
+(defx86lapfunction %copy-ivector-to-ivector-predecrement-128bit ((src 16) (src-byte-offset 8) #||(ra 0)||# (dest arg_x) (dest-byte-offset arg_y) (nbytes arg_z))
+  (let ((rsrc temp0)                    ; descending copy in 16-byte units via xmm0-xmm3; returns DEST
+        (srcidx imm0)                   ; untagged source byte index (decremented before each load)
+        (destidx imm1))                 ; untagged destination byte index (decremented before each store)
+    (movq (@ src (% rsp)) (% rsrc))     ; src and src-byte-offset are passed on the stack
+    (movq (@ src-byte-offset (% rsp)) (% srcidx))
+    (sarq ($ target::fixnumshift) (% srcidx)) ; strip the fixnum tag
+    (movq (% dest-byte-offset) (% destidx))
+    (sarq ($ target::fixnumshift) (% destidx)) ; strip the fixnum tag
+    (jmp @test)
+    @loop
+    (lea (@ -64 (% destidx)) (% destidx)) ; step both indices down 64 bytes first
+    (lea (@ -64 (% srcidx)) (% srcidx))
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0)) ; load 64 bytes as four aligned 16-byte moves (movdqa needs 16-byte-aligned addresses, so the caller must pass suitably aligned end offsets)
+    (movdqa (@ (+ 16 target::misc-data-offset) (% rsrc) (% srcidx)) (% xmm1))
+    (movdqa (@ (+ 32 target::misc-data-offset) (% rsrc) (% srcidx)) (% xmm2))
+    (movdqa (@ (+ 48 target::misc-data-offset) (% rsrc) (% srcidx)) (% xmm3))
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx))) ; store the same 64 bytes (all four loads happen before any store)
+    (movdqa (% xmm1) (@ (+ 16 target::misc-data-offset) (% dest) (% destidx)))
+    (movdqa (% xmm2) (@ (+ 32 target::misc-data-offset) (% dest) (% destidx)))
+    (movdqa (% xmm3) (@ (+ 48 target::misc-data-offset) (% dest) (% destidx)))
+    (subq ($ '64) (% nbytes))           ; 64 fewer bytes remaining (count is a tagged fixnum)
+    @test
+    (cmpq ($ '64) (% nbytes))           ; at least 64 bytes left?
+    (jge @loop)
+    (testq (% nbytes) (% nbytes))       ; nothing left?
+    (je @done)
+    (cmpq ($ '32) (% nbytes))           ; remainder is expected to be 16, 32 or 48 (the caller in this file passes a multiple of 16)
+    (je @two)                           ; exactly 32: two more 16-byte moves
+    (jl @one)                           ; below 32: one more 16-byte move
+    (lea (@ -16 (% srcidx)) (% srcidx)) ; above 32: three moves, falling through @two and @one
+    (lea (@ -16 (% destidx)) (% destidx))
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0))
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx)))
+    @two
+    (lea (@ -16 (% srcidx)) (% srcidx))
+    (lea (@ -16 (% destidx)) (% destidx))
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0))
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx)))
+    @one
+    (lea (@ -16 (% srcidx)) (% srcidx)) ; final (lowest) 16 bytes
+    (lea (@ -16 (% destidx)) (% destidx))
+    (movdqa (@ target::misc-data-offset (% rsrc) (% srcidx)) (% xmm0))
+    (movdqa (% xmm0) (@ target::misc-data-offset (% dest) (% destidx)))
+    @done
+    (movq (% dest) (% arg_z))           ; return DEST
+    (single-value-return 4)))
+
 
 (defx86lapfunction %copy-gvector-to-gvector ((src (* 2 x8664::node-size))
