Index: /branches/working-0711/ccl/level-1/l1-unicode.lisp
===================================================================
--- /branches/working-0711/ccl/level-1/l1-unicode.lisp	(revision 9630)
+++ /branches/working-0711/ccl/level-1/l1-unicode.lisp	(revision 9631)
@@ -77,6 +77,7 @@
   octets-in-string-function              ;(STRING START END)
 
-  ;; Returns the number of (full) characters encoded in VECTOR, and the index
-  ;; of the first octet not used to encode them. (The second value may be less than END).
+  ;; Returns the number of (full) characters encoded in VECTOR, and
+  ;; the index the index of the first octet not used to encode
+  ;; them. (The second value may be less than END.
   length-of-vector-encoding-function    ;(VECTOR START END) 
 
@@ -113,6 +114,5 @@
 (defconstant byte-order-mark #\u+feff)
 (defconstant byte-order-mark-char-code (char-code byte-order-mark))
-(defconstant swapped-byte-order-mark #\u+fffe)
-(defconstant swapped-byte-order-mark-char-code (char-code swapped-byte-order-mark))
+(defconstant swapped-byte-order-mark-char-code #xfffe)
 
 
@@ -179,9 +179,9 @@
   (if (>= end start)
     (values (- end start) end)
-    (values 0 0)))
+    (values 0 start)))
 
 (defun 8-bit-fixed-width-length-of-memory-encoding (pointer noctets start)
   (declare (ignore pointer start))
-  noctets)
+  (values noctets noctets))
 
 (define-character-encoding :iso-8859-1
@@ -2989,5 +2989,5 @@
                                     (or (>= 1st-unit #xe1)
                                         (>= s1 #xa0)))
-                             (code-char (the fixnum
+                             (or (code-char (the fixnum
                                           (logior (the fixnum
                                                     (ash (the fixnum (logand 1st-unit #xf))
@@ -2999,4 +2999,5 @@
                                                             6))
                                                      (the fixnum (logand s2 #x3f)))))))
+                                 #\Replacement_Character)
                              #\Replacement_Character)
                            (if (< 1st-unit #xf8)
@@ -3063,5 +3064,6 @@
                    (setf (aref vector (the fixnum (+ idx 2)))
                          (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -6))))))
-                   (setf (aref vector (the fixnum (+ idx 3))) (logand #x3f code))
+                   (setf (aref vector (the fixnum (+ idx 3)))
+                         (logior #x80 (logand #x3f code)))
                    (incf idx 4)))))))
     :vector-decode-function
@@ -3141,11 +3143,12 @@
              (nchars 0))
             ((>= i end)
-             (if (= i end) (values nchars i)))
+             (values nchars i))
          (declare (fixnum i))
          (let* ((code (aref vector i))
-                (nexti (+ i (cond ((< code #x80) 1)
+                (nexti (+ i (cond ((< code #xc2) 1)
                                   ((< code #xe0) 2)
                                   ((< code #xf0) 3)
-                                  (t 4)))))
+                                  ((< code #xf8) 4)
+                                  (t 1)))))
            (declare (type (unsigned-byte 8) code))
            (if (> nexti end)
@@ -3332,5 +3335,5 @@
              (index idx))
             ((= index end) index)
-         (declare (fixnum i end index))
+         (declare (fixnum i len index))
          (let* ((1st-unit (%native-u8-ref-u16 vector index)))
            (declare (type (unsigned-byte 16) 1st-unit))
@@ -3376,5 +3379,5 @@
              (index idx))
             ((>= index end) index)
-         (declare (fixnum i index end))
+         (declare (fixnum i index p))
          (let* ((1st-unit (%get-unsigned-word pointer index)))
            (declare (type (unsigned-byte 16) 1st-unit))
@@ -3496,5 +3499,5 @@
            (index idx))
           ((= index end) index)
-       (declare (fixnum i end index))
+       (declare (fixnum i len index))
        (let* ((1st-unit (%reversed-u8-ref-u16 vector index)))
          (declare (type (unsigned-byte 16) 1st-unit))
@@ -3540,5 +3543,5 @@
            (index idx))
           ((>= index end) index)
-       (declare (fixnum i index end))
+       (declare (fixnum i index p))
        (let* ((1st-unit (%swap-u16 (%get-unsigned-word pointer index))))
          (declare (type (unsigned-byte 16) 1st-unit))
@@ -3638,4 +3641,6 @@
      (declare (type (simple-array (unsigned-byte 8) (*)) vector)
               (fixnum idx))
+     (setf (%native-u8-ref-u16 vector idx) byte-order-mark-char-code)
+     (incf idx 2)
      (do* ((i start (1+ i)))
             ((>= i end) idx)
@@ -3662,5 +3667,6 @@
      (declare (type (simple-array (unsigned-byte 16) (*)) vector)
               (type index idx))
-     (let* ((swap (if (>= noctets 2)
+     (let* ((origin idx)
+            (swap (if (>= noctets 2)
                     (case (%native-u8-ref-u16 vector idx)
                       (#.byte-order-mark-char-code
@@ -3670,8 +3676,8 @@
                       (t #+little-endian-target t)))))
        (do* ((i 0 (1+ i))
-             (end (+ idx noctets))
+             (end (+ origin noctets))
              (index idx))
             ((= index end) index)
-         (declare (fixnum i end index))
+         (declare (fixnum i len index))
          (let* ((1st-unit (if swap
                             (%reversed-u8-ref-u16 vector index)
@@ -3698,4 +3704,7 @@
    (lambda (string pointer idx start end)
      (declare (fixnum idx))
+     ;; Output a BOM.
+     (setf (%get-unsigned-word pointer idx) byte-order-mark-char-code)
+     (incf idx 2)
      (do* ((i start (1+ i)))
           ((>= i end) idx)
@@ -3703,23 +3712,18 @@
               (highbits (- code #x10000)))
          (declare (type (mod #x110000) code)
-                  (fixnum highbits))
+                  (fixnum p highbits))
          (cond ((< highbits 0)
-                (setf (%get-unsigned-word pointer idx) #+big-endian-target code #+little-endian-target (%swap-u16 code))
+                (setf (%get-unsigned-word pointer idx) code)
                 (incf idx 2))
                (t
-                (let* ((w1 (logior #xd800 (the fixnum (ash highbits -10))))
-                       (w2 (logior #xdc00 (the fixnum (logand highbits #x3ff)))))
-                  (declare (type (unsigned-byte 16) w1 w2))
-                (setf (%get-unsigned-word pointer idx)
-                      #+big-endian-target w1 #+little-endian-target (%swap-u16 w1))
+                (setf (%get-unsigned-word pointer idx) (logior #xd800 (the fixnum (ash highbits -10))))
                 (setf (%get-unsigned-word pointer (the fixnum (+ idx 2)))
-                      #+big-endian-target w2
-                      #+little-endian-target (%swap-u16 w2))
-                (incf idx 4))))))))
+                      (logior #xdc00 (the fixnum (logand highbits #x3ff))))
+                (incf idx 4)))))))
   :memory-decode-function
   (nfunction
    utf-16-memory-decode
    (lambda (pointer noctets idx string)
-     (declare (fixnum noctets idx))
+     (declare (fixnum nunits idx))
      (let* ((swap (when (> noctets 1)
                     (case (%get-unsigned-word pointer idx)
@@ -3737,5 +3741,5 @@
              (index idx ))
             ((>= index end) index)
-         (declare (fixnum i index end))
+         (declare (fixnum i index p))
          (let* ((1st-unit (%get-unsigned-word pointer index)))
            (declare (type (unsigned-byte 16) 1st-unit))
@@ -3756,5 +3760,8 @@
              (setf (schar string i) (or char #\Replacement_Character))))))))
   :octets-in-string-function
-  #'utf-16-octets-in-string
+  (nfunction
+   utf-16-bom-octets-in-string
+   (lambda (string start end)
+     (+ 2 (utf-16-octets-in-string string start end))))
   :length-of-vector-encoding-function
   (nfunction
@@ -3762,5 +3769,5 @@
    (lambda (vector start end)
      (declare (type (simple-array (unsigned-byte 16) (*)) vector))
-     (let* ((swap (when (> end start)
+     (let* ((swap (when (>= end (+ start 2))
                     (case (%native-u8-ref-u16 vector start)
                       (#.byte-order-mark-char-code
@@ -3775,5 +3782,5 @@
              (nchars 0))
             ((> j end)
-             (if (= i end) (values nchars i)))
+             (values nchars i))
          (let* ((code (if swap
                         (%reversed-u8-ref-u16 vector i)
@@ -3792,28 +3799,35 @@
    utf-16-length-of-memory-encoding
    (lambda (pointer noctets start)
-     (let* ((swap (when (>= noctets 2)
+     (declare (fixnum noctets start))
+     (when (oddp noctets)
+       (setq noctets (1- noctets)))
+     (let* ((origin start)
+            (swap (when (>= noctets 2)
                     (case (%get-unsigned-word pointer (+ start start))
                       (#.byte-order-mark-char-code
                        (incf start 2)
-                       (decf noctets 2)
                        nil)
                       (#.swapped-byte-order-mark-char-code
                        (incf start 2)
-                       (decf noctets 2)
                        t)
                       (t #+little-endian-target t)))))
+       (declare (fixnum origin))
        (do* ((i start)
              (j (+ i 2) (+ i 2))
-             (end (+ start noctets))
+             (end (+ origin noctets))
              (nchars 0 (1+ nchars)))
-            ((> j end) (values nchars i))
+            ((> j end) (values nchars (- i origin)))
+         (declare (fixnum (i j end nchars)))
          (let* ((code (%get-unsigned-word pointer i)))
            (declare (type (unsigned-byte 16) code))
            (if swap (setq code (%swap-u16 code)))
-           (incf i
-                 (if (or (< code #xd800)
-                         (>= code #xdc00))
-                   2
-                   4)))))))
+           (let* ((nexti (+ i (if (or (< code #xd800)
+                                      (>= code #xdc00))
+                                2
+                                4))))
+             (declare (fixnum nexti))
+             (if (> nexti end)
+               (return (values nchars (- i origin)))
+               (setq i nexti))))))))
   :decode-literal-code-unit-limit #xd800
   :encode-literal-char-code-limit #x10000  
@@ -3847,4 +3861,14 @@
     (* 2 (- end start))
     0))
+
+(defun ucs-2-length-of-vector-encoding (vector start end)
+  (declare (ignore vector))
+  (let* ((noctets (max (- end start) 0)))
+    (values (ash noctets -1) (+ start (logandc2 noctets 1)))))
+
+(defun ucs-2-length-of-memory-encoding (pointer noctets start)
+  (declare (ignore pointer start))
+  (values (ash noctets -1) (logandc2 noctets 1)))
+
 
 
@@ -3895,5 +3919,5 @@
            (index idx (+ 2 index)))
           ((>= index end) index)
-       (declare (fixnum i end index))
+       (declare (fixnum i len index))
        (setf (schar string i)
              (or (code-char (%native-u8-ref-u16 vector index))
@@ -3928,18 +3952,7 @@
   #'ucs-2-octets-in-string
   :length-of-vector-encoding-function
-  (nfunction
-   native-ucs-2-length-of-vector-encoding
-   (lambda (vector start end)
-     (declare (ignore vector))
-     (do* ((i start (1+ i))
-           (j (+ i 2) (+ i 2))
-           (nchars 0 (1+ nchars)))
-          ((> j end) (values nchars i)))))
+  #'ucs-2-length-of-vector-encoding
   :length-of-memory-encoding-function
-  (nfunction
-   native-ucs-2-length-of-memory-encoding
-   (lambda (pointer noctets start)
-     (declare (ignore pointer))
-     (values (floor noctets 2) (+ start noctets))))
+  #'ucs-2-length-of-memory-encoding
   :decode-literal-code-unit-limit #x10000
   :encode-literal-char-code-limit #x10000  
@@ -3994,5 +4007,5 @@
            (index idx (+ 2 index)))
           ((>= index end) index)
-       (declare (fixnum i end index))
+       (declare (fixnum i len index))
        (setf (schar string i)
              (or (code-char (%reversed-u8-ref-u16 vector index))
@@ -4027,18 +4040,7 @@
   #'ucs-2-octets-in-string
   :length-of-vector-encoding-function
-  (nfunction
-   reversed-ucs-2-length-of-vector-encoding
-   (lambda (vector start end)
-     (declare (ignore vector))
-     (do* ((i start (1+ i))
-           (j (+ i 2) (+ i 2))
-           (nchars 0 (1+ nchars)))
-          ((> j end) (values nchars i)))))
+  #'ucs-2-length-of-vector-encoding
   :length-of-memory-encoding-function
-  (nfunction
-   reversed-ucs-2-length-of-memory-encoding
-   (lambda (pointer noctets start)
-     (declare (ignore pointer))
-     (values (floor noctets 2) (+ start noctets))))
+  #'ucs-2-length-of-memory-encoding
   :decode-literal-code-unit-limit #x10000
   :encode-literal-char-code-limit #x10000
@@ -4066,4 +4068,6 @@
      (declare (type (simple-array (unsigned-byte 8) (*)) vector)
               (fixnum idx))
+     (setf (%native-u8-ref-u16 vector idx) byte-order-mark-char-code)
+     (incf idx 2)
      (do* ((i start (1+ i)))
           ((>= i end) idx)
@@ -4094,5 +4098,5 @@
              (index idx (1+ index)))
             ((>= index end) index)
-         (declare (fixnum i end index))
+         (declare (fixnum i len index))
          (let* ((1st-unit (if swap
                             (%reversed-u8-ref-u16 vector index)
@@ -4105,4 +4109,6 @@
    (lambda (string pointer idx start end)
      (declare (fixnum idx))
+     (setf (%get-unsigned-word pointer idx) byte-order-mark-char-code)
+     (incf idx 2)
      (do* ((i start (1+ i)))
           ((>= i end) idx)
@@ -4139,12 +4145,21 @@
          (setf (schar string i) (or (code-char 1st-unit) #\Replacement_Character)))))))
   :octets-in-string-function
-  #'ucs-2-octets-in-string
+  (nfunction
+   ucs-2-bom-octets-in-string
+   (lambda (string start end)
+     (+ 2 (ucs-2-octets-in-string string start end))))
   :length-of-vector-encoding-function
   (nfunction
    ucs-2-length-of-vector-encoding
    (lambda (vector start end)
-     (declare (ignore vector))
-     (do* ((i start (1+ i))
-           (j (+ i 2) (+ i 2))
+     (declare (fixnum start end))
+     (when (>= end (+ start 2))
+       (let* ((maybe-bom (%native-u8-ref-u16 vector start)))
+         (declare (type (unsigned-byte 16) maybe-bom))
+         (when (or (= maybe-bom byte-order-mark-char-code)
+                   (= maybe-bom swapped-byte-order-mark-char-code))
+           (incf start 2))))
+     (do* ((i start j)
+           (j (+ i 2) (+ j 2))
            (nchars 0 (1+ nchars)))
           ((> j end) (values nchars i)))))
@@ -4153,13 +4168,12 @@
    ucs-2-length-of-memory-encoding
    (lambda (pointer noctets start)
-     (when (> noctets 1)
-       (case (%get-unsigned-word pointer )
-         (#.byte-order-mark-char-code
-          (incf start 2)
-          (decf noctets 2))
-         (#.swapped-byte-order-mark-char-code
-          (incf start 2)
-          (decf noctets 2))))
-     (values (floor noctets 2) (+ start noctets))))
+     (let* ((skip 
+             (when (> noctets 1)
+               (case (%get-unsigned-word pointer start)
+                 (#.byte-order-mark-char-code
+                  2)
+                 (#.swapped-byte-order-mark-char-code
+                  2)))))
+     (values (ash (- noctets skip) -1) (logandc2 noctets 1)))))
   :decode-literal-code-unit-limit #x10000
   :encode-literal-char-code-limit #x10000  
@@ -4299,5 +4313,5 @@
            (index idx (+ 4 index)))
           ((>= index end) index)
-       (declare (fixnum i end index))
+       (declare (fixnum i len index))
        (let* ((code (%native-u8-ref-u32 vector index)))
          (declare (type (unsigned-byte 32) code))
@@ -4338,6 +4352,6 @@
    (lambda (vector start end)
      (declare (ignore vector))
-     (do* ((i start (1+ i))
-           (j (+ i 4) (+ i 4))
+     (do* ((i start j)
+           (j (+ i 4) (+ j 4))
            (nchars 0 (1+ nchars)))
           ((> j end) (values nchars i)))))
@@ -4347,5 +4361,5 @@
    (lambda (pointer noctets start)
      (declare (ignore pointer))
-     (values (floor noctets 4) (+ start noctets))))
+     (values (ash noctets -2) (+ start (logandc2 noctets 3)))))
   :decode-literal-code-unit-limit #x110000
   :encode-literal-char-code-limit #x110000
@@ -4396,5 +4410,5 @@
            (index idx (+ 4 index)))
           ((>= index end) index)
-       (declare (fixnum i end index))
+       (declare (fixnum i len index))
        (let* ((code (%reversed-u8-ref-u32 vector index)))
          (declare (type (unsigned-byte 32) code))
@@ -4436,6 +4450,6 @@
    (lambda (vector start end)
      (declare (ignore vector))
-     (do* ((i start (1+ i))
-           (j (+ i 4) (+ i 4))
+     (do* ((i start j)
+           (j (+ i 4) (+ j 4))
            (nchars 0 (1+ nchars)))
           ((> j end) (values nchars i)))))
@@ -4445,5 +4459,5 @@
    (lambda (pointer noctets start)
      (declare (ignore pointer))
-     (values (floor noctets 4) (+ start noctets))))
+     (values (ash noctets -2) (+ start (logandc2 noctets 3)))))
   :decode-literal-code-unit-limit #x110000
   :encode-literal-char-code-limit #x110000
@@ -4454,5 +4468,5 @@
     "A 32-bit, fixed-length encoding in which all Unicode characters can be encoded in a single 32-bit word.  The endianness of the encoded data is indicated by the endianness of a byte-order-mark character (#\u+feff) prepended to the data; in the absence of such a character on input, input data is assumed to be in big-endian order.  Output is written in native byte order with a leading byte-order mark."
     
-  :aliases '(:utf-4)
+  :aliases '(:ucs-4)
   :max-units-per-char 1
   :code-unit-size 32
@@ -4468,4 +4482,6 @@
      (declare (type (simple-array (unsigned-byte 8) (*)) vector)
               (fixnum idx))
+     (setf (%native-u8-ref-u32 vector idx) byte-order-mark-char-code)
+     (incf idx 4)
      (do* ((i start (1+ i)))
           ((>= i end) idx)
@@ -4494,5 +4510,5 @@
              (index idx (1+ index)))
             ((>= index end) index)
-         (declare (fixnum i end index))
+         (declare (fixnum i len index))
          (let* ((1st-unit (if swap
                             (%reversed-u8-ref-u32 vector index)
@@ -4507,5 +4523,6 @@
    (lambda (string pointer idx start end)
      (declare (fixnum idx))
-
+     (setf (%get-unsigned-long pointer idx) byte-order-mark-char-code)
+     (incf idx 4)
      (do* ((i start (1+ i)))
           ((>= i end) idx)
@@ -4541,12 +4558,20 @@
                                     #\Replacement_Character)))))))
   :octets-in-string-function
-  #'ucs-4-octets-in-string
+  (nfunction
+   utf-32-bom-octets-in-string
+   (lambda (string start end)
+     (+ 4 (ucs-4-octets-in-string string start end))))
   :length-of-vector-encoding-function
   (nfunction
    utf-32-length-of-vector-encoding
    (lambda (vector start end)
-     (declare (ignore vector))
-     (do* ((i start (1+ i))
-           (j (+ i 2) (+ i 2))
+     (when (>= end (+ start 4))
+       (let* ((maybe-bom (%native-u8-ref-u32 vector start)))
+         (declare (type (unsigned-byte 32) maybe-bom))
+         (when (or (= maybe-bom byte-order-mark-char-code)
+                   (= maybe-bom swapped-byte-order-mark-char-code))
+           (incf start 4))))
+     (do* ((i start j)
+           (j (+ i 4) (+ J 4))
            (nchars 0 (1+ nchars)))
           ((> j end) (values nchars i)))))
@@ -4555,5 +4580,5 @@
    utf-32-length-of-memory-encoding
    (lambda (pointer noctets start)
-     (when (> noctets 1)
+     (when (> noctets 3)
        (case (%get-unsigned-long pointer )
          (#.byte-order-mark-char-code
@@ -4563,5 +4588,5 @@
           (incf start 4)
           (decf noctets 4))))
-     (values (floor noctets 4) (+ start noctets))))
+     (values (ash noctets -2) (+ start (logandc2 noctets 3)))))
   :decode-literal-code-unit-limit #x110000
   :encode-literal-char-code-limit #x110000  
@@ -4604,5 +4629,5 @@
 (defvar *cr-newline-string* (make-string 1 :initial-element #\Return))
 (defvar *crlf-newline-string* (make-array 2 :element-type 'character :initial-contents '(#\Return #\Linefeed)))
-(defvar *nul-string (make-string 1 :initial-element #\Nul))
+(defvar *nul-string* (make-string 1 :initial-element #\Nul))
 
 (defun string-size-in-octets (string &key
