- Timestamp:
- May 30, 2008, 8:32:29 AM (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/working-0711/ccl/level-1/l1-unicode.lisp
r9620 r9631 77 77 octets-in-string-function ;(STRING START END) 78 78 79 ;; Returns the number of (full) characters encoded in VECTOR, and the index 80 ;; of the first octet not used to encode them. (The second value may be less than END). 79 ;; Returns the number of (full) characters encoded in VECTOR, and 80 ;; the index the index of the first octet not used to encode 81 ;; them. (The second value may be less than END. 81 82 length-of-vector-encoding-function ;(VECTOR START END) 82 83 … … 113 114 (defconstant byte-order-mark #\u+feff) 114 115 (defconstant byte-order-mark-char-code (char-code byte-order-mark)) 115 (defconstant swapped-byte-order-mark #\u+fffe) 116 (defconstant swapped-byte-order-mark-char-code (char-code swapped-byte-order-mark)) 116 (defconstant swapped-byte-order-mark-char-code #xfffe) 117 117 118 118 … … 179 179 (if (>= end start) 180 180 (values (- end start) end) 181 (values 0 0)))181 (values 0 start))) 182 182 183 183 (defun 8-bit-fixed-width-length-of-memory-encoding (pointer noctets start) 184 184 (declare (ignore pointer start)) 185 noctets)185 (values noctets noctets)) 186 186 187 187 (define-character-encoding :iso-8859-1 … … 2989 2989 (or (>= 1st-unit #xe1) 2990 2990 (>= s1 #xa0))) 2991 ( code-char (the fixnum2991 (or (code-char (the fixnum 2992 2992 (logior (the fixnum 2993 2993 (ash (the fixnum (logand 1st-unit #xf)) … … 2999 2999 6)) 3000 3000 (the fixnum (logand s2 #x3f))))))) 3001 #\Replacement_Character) 3001 3002 #\Replacement_Character) 3002 3003 (if (< 1st-unit #xf8) … … 3063 3064 (setf (aref vector (the fixnum (+ idx 2))) 3064 3065 (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -6)))))) 3065 (setf (aref vector (the fixnum (+ idx 3))) (logand #x3f code)) 3066 (setf (aref vector (the fixnum (+ idx 3))) 3067 (logior #x80 (logand #x3f code))) 3066 3068 (incf idx 4))))))) 3067 3069 :vector-decode-function … … 3141 3143 (nchars 0)) 3142 3144 ((>= i end) 3143 ( if (= i end) (values nchars i)))3145 (values nchars i)) 3144 3146 (declare (fixnum i)) 3145 3147 (let* ((code (aref vector i)) 3146 (nexti (+ i (cond ((< code #x 80) 1)3148 (nexti (+ i (cond ((< code #xc2) 1) 3147 3149 ((< code #xe0) 2) 3148 3150 ((< code #xf0) 3) 3149 (t 4))))) 3151 ((< code #xf8) 4) 3152 (t 1))))) 3150 3153 (declare (type (unsigned-byte 8) code)) 3151 3154 (if (> nexti end) … … 3332 3335 (index idx)) 3333 3336 ((= index end) index) 3334 (declare (fixnum i endindex))3337 (declare (fixnum i len index)) 3335 3338 (let* ((1st-unit (%native-u8-ref-u16 vector index))) 3336 3339 (declare (type (unsigned-byte 16) 1st-unit)) … … 3376 3379 (index idx)) 3377 3380 ((>= index end) index) 3378 (declare (fixnum i index end))3381 (declare (fixnum i index p)) 3379 3382 (let* ((1st-unit (%get-unsigned-word pointer index))) 3380 3383 (declare (type (unsigned-byte 16) 1st-unit)) … … 3496 3499 (index idx)) 3497 3500 ((= index end) index) 3498 (declare (fixnum i endindex))3501 (declare (fixnum i len index)) 3499 3502 (let* ((1st-unit (%reversed-u8-ref-u16 vector index))) 3500 3503 (declare (type (unsigned-byte 16) 1st-unit)) … … 3540 3543 (index idx)) 3541 3544 ((>= index end) index) 3542 (declare (fixnum i index end))3545 (declare (fixnum i index p)) 3543 3546 (let* ((1st-unit (%swap-u16 (%get-unsigned-word pointer index)))) 3544 3547 (declare (type (unsigned-byte 16) 1st-unit)) … … 3638 3641 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 3639 3642 (fixnum idx)) 3643 (setf (%native-u8-ref-u16 vector idx) byte-order-mark-char-code) 3644 (incf idx 2) 3640 3645 (do* ((i start (1+ i))) 3641 3646 ((>= i end) idx) … … 3662 3667 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 3663 3668 (type index idx)) 3664 (let* ((swap (if (>= noctets 2) 3669 (let* ((origin idx) 3670 (swap (if (>= noctets 2) 3665 3671 (case (%native-u8-ref-u16 vector idx) 3666 3672 (#.byte-order-mark-char-code … … 3670 3676 (t #+little-endian-target t))))) 3671 3677 (do* ((i 0 (1+ i)) 3672 (end (+ idxnoctets))3678 (end (+ origin noctets)) 3673 3679 (index idx)) 3674 3680 ((= index end) index) 3675 (declare (fixnum i endindex))3681 (declare (fixnum i len index)) 3676 3682 (let* ((1st-unit (if swap 3677 3683 (%reversed-u8-ref-u16 vector index) … … 3698 3704 (lambda (string pointer idx start end) 3699 3705 (declare (fixnum idx)) 3706 ;; Output a BOM. 3707 (setf (%get-unsigned-word pointer idx) byte-order-mark-char-code) 3708 (incf idx 2) 3700 3709 (do* ((i start (1+ i))) 3701 3710 ((>= i end) idx) … … 3703 3712 (highbits (- code #x10000))) 3704 3713 (declare (type (mod #x110000) code) 3705 (fixnum highbits))3714 (fixnum p highbits)) 3706 3715 (cond ((< highbits 0) 3707 (setf (%get-unsigned-word pointer idx) #+big-endian-target code #+little-endian-target (%swap-u16 code))3716 (setf (%get-unsigned-word pointer idx) code) 3708 3717 (incf idx 2)) 3709 3718 (t 3710 (let* ((w1 (logior #xd800 (the fixnum (ash highbits -10)))) 3711 (w2 (logior #xdc00 (the fixnum (logand highbits #x3ff))))) 3712 (declare (type (unsigned-byte 16) w1 w2)) 3713 (setf (%get-unsigned-word pointer idx) 3714 #+big-endian-target w1 #+little-endian-target (%swap-u16 w1)) 3719 (setf (%get-unsigned-word pointer idx) (logior #xd800 (the fixnum (ash highbits -10)))) 3715 3720 (setf (%get-unsigned-word pointer (the fixnum (+ idx 2))) 3716 #+big-endian-target w2 3717 #+little-endian-target (%swap-u16 w2)) 3718 (incf idx 4)))))))) 3721 (logior #xdc00 (the fixnum (logand highbits #x3ff)))) 3722 (incf idx 4))))))) 3719 3723 :memory-decode-function 3720 3724 (nfunction 3721 3725 utf-16-memory-decode 3722 3726 (lambda (pointer noctets idx string) 3723 (declare (fixnum n octets idx))3727 (declare (fixnum nunits idx)) 3724 3728 (let* ((swap (when (> noctets 1) 3725 3729 (case (%get-unsigned-word pointer idx) … … 3737 3741 (index idx )) 3738 3742 ((>= index end) index) 3739 (declare (fixnum i index end))3743 (declare (fixnum i index p)) 3740 3744 (let* ((1st-unit (%get-unsigned-word pointer index))) 3741 3745 (declare (type (unsigned-byte 16) 1st-unit)) … … 3756 3760 (setf (schar string i) (or char #\Replacement_Character)))))))) 3757 3761 :octets-in-string-function 3758 #'utf-16-octets-in-string 3762 (nfunction 3763 utf-16-bom-octets-in-string 3764 (lambda (string start end) 3765 (+ 2 (utf-16-octets-in-string string start end)))) 3759 3766 :length-of-vector-encoding-function 3760 3767 (nfunction … … 3762 3769 (lambda (vector start end) 3763 3770 (declare (type (simple-array (unsigned-byte 16) (*)) vector)) 3764 (let* ((swap (when (> end start)3771 (let* ((swap (when (>= end (+ start 2)) 3765 3772 (case (%native-u8-ref-u16 vector start) 3766 3773 (#.byte-order-mark-char-code … … 3775 3782 (nchars 0)) 3776 3783 ((> j end) 3777 ( if (= i end) (values nchars i)))3784 (values nchars i)) 3778 3785 (let* ((code (if swap 3779 3786 (%reversed-u8-ref-u16 vector i) … … 3792 3799 utf-16-length-of-memory-encoding 3793 3800 (lambda (pointer noctets start) 3794 (let* ((swap (when (>= noctets 2) 3801 (declare (fixnum noctets start)) 3802 (when (oddp noctets) 3803 (setq noctets (1- noctets))) 3804 (let* ((origin start) 3805 (swap (when (>= noctets 2) 3795 3806 (case (%get-unsigned-word pointer (+ start start)) 3796 3807 (#.byte-order-mark-char-code 3797 3808 (incf start 2) 3798 (decf noctets 2)3799 3809 nil) 3800 3810 (#.swapped-byte-order-mark-char-code 3801 3811 (incf start 2) 3802 (decf noctets 2)3803 3812 t) 3804 3813 (t #+little-endian-target t))))) 3814 (declare (fixnum origin)) 3805 3815 (do* ((i start) 3806 3816 (j (+ i 2) (+ i 2)) 3807 (end (+ startnoctets))3817 (end (+ origin noctets)) 3808 3818 (nchars 0 (1+ nchars))) 3809 ((> j end) (values nchars i)) 3819 ((> j end) (values nchars (- i origin))) 3820 (declare (fixnum (i j end nchars))) 3810 3821 (let* ((code (%get-unsigned-word pointer i))) 3811 3822 (declare (type (unsigned-byte 16) code)) 3812 3823 (if swap (setq code (%swap-u16 code))) 3813 (incf i 3814 (if (or (< code #xd800) 3815 (>= code #xdc00)) 3816 2 3817 4))))))) 3824 (let* ((nexti (+ i (if (or (< code #xd800) 3825 (>= code #xdc00)) 3826 2 3827 4)))) 3828 (declare (fixnum nexti)) 3829 (if (> nexti end) 3830 (return (values nchars (- i origin))) 3831 (setq i nexti)))))))) 3818 3832 :decode-literal-code-unit-limit #xd800 3819 3833 :encode-literal-char-code-limit #x10000 … … 3847 3861 (* 2 (- end start)) 3848 3862 0)) 3863 3864 (defun ucs-2-length-of-vector-encoding (vector start end) 3865 (declare (ignore vector)) 3866 (let* ((noctets (max (- end start) 0))) 3867 (values (ash noctets -1) (+ start (logandc2 noctets 1))))) 3868 3869 (defun ucs-2-length-of-memory-encoding (pointer noctets start) 3870 (declare (ignore pointer start)) 3871 (values (ash noctets -1) (logandc2 noctets 1))) 3872 3849 3873 3850 3874 … … 3895 3919 (index idx (+ 2 index))) 3896 3920 ((>= index end) index) 3897 (declare (fixnum i endindex))3921 (declare (fixnum i len index)) 3898 3922 (setf (schar string i) 3899 3923 (or (code-char (%native-u8-ref-u16 vector index)) … … 3928 3952 #'ucs-2-octets-in-string 3929 3953 :length-of-vector-encoding-function 3930 (nfunction 3931 native-ucs-2-length-of-vector-encoding 3932 (lambda (vector start end) 3933 (declare (ignore vector)) 3934 (do* ((i start (1+ i)) 3935 (j (+ i 2) (+ i 2)) 3936 (nchars 0 (1+ nchars))) 3937 ((> j end) (values nchars i))))) 3954 #'ucs-2-length-of-vector-encoding 3938 3955 :length-of-memory-encoding-function 3939 (nfunction 3940 native-ucs-2-length-of-memory-encoding 3941 (lambda (pointer noctets start) 3942 (declare (ignore pointer)) 3943 (values (floor noctets 2) (+ start noctets)))) 3956 #'ucs-2-length-of-memory-encoding 3944 3957 :decode-literal-code-unit-limit #x10000 3945 3958 :encode-literal-char-code-limit #x10000 … … 3994 4007 (index idx (+ 2 index))) 3995 4008 ((>= index end) index) 3996 (declare (fixnum i endindex))4009 (declare (fixnum i len index)) 3997 4010 (setf (schar string i) 3998 4011 (or (code-char (%reversed-u8-ref-u16 vector index)) … … 4027 4040 #'ucs-2-octets-in-string 4028 4041 :length-of-vector-encoding-function 4029 (nfunction 4030 reversed-ucs-2-length-of-vector-encoding 4031 (lambda (vector start end) 4032 (declare (ignore vector)) 4033 (do* ((i start (1+ i)) 4034 (j (+ i 2) (+ i 2)) 4035 (nchars 0 (1+ nchars))) 4036 ((> j end) (values nchars i))))) 4042 #'ucs-2-length-of-vector-encoding 4037 4043 :length-of-memory-encoding-function 4038 (nfunction 4039 reversed-ucs-2-length-of-memory-encoding 4040 (lambda (pointer noctets start) 4041 (declare (ignore pointer)) 4042 (values (floor noctets 2) (+ start noctets)))) 4044 #'ucs-2-length-of-memory-encoding 4043 4045 :decode-literal-code-unit-limit #x10000 4044 4046 :encode-literal-char-code-limit #x10000 … … 4066 4068 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 4067 4069 (fixnum idx)) 4070 (setf (%native-u8-ref-u16 vector idx) byte-order-mark-char-code) 4071 (incf idx 2) 4068 4072 (do* ((i start (1+ i))) 4069 4073 ((>= i end) idx) … … 4094 4098 (index idx (1+ index))) 4095 4099 ((>= index end) index) 4096 (declare (fixnum i endindex))4100 (declare (fixnum i len index)) 4097 4101 (let* ((1st-unit (if swap 4098 4102 (%reversed-u8-ref-u16 vector index) … … 4105 4109 (lambda (string pointer idx start end) 4106 4110 (declare (fixnum idx)) 4111 (setf (%get-unsigned-word pointer idx) byte-order-mark-char-code) 4112 (incf idx 2) 4107 4113 (do* ((i start (1+ i))) 4108 4114 ((>= i end) idx) … … 4139 4145 (setf (schar string i) (or (code-char 1st-unit) #\Replacement_Character))))))) 4140 4146 :octets-in-string-function 4141 #'ucs-2-octets-in-string 4147 (nfunction 4148 ucs-2-bom-octets-in-string 4149 (lambda (string start end) 4150 (+ 2 (ucs-2-octets-in-string string start end)))) 4142 4151 :length-of-vector-encoding-function 4143 4152 (nfunction 4144 4153 ucs-2-length-of-vector-encoding 4145 4154 (lambda (vector start end) 4146 (declare (ignore vector)) 4147 (do* ((i start (1+ i)) 4148 (j (+ i 2) (+ i 2)) 4155 (declare (fixnum start end)) 4156 (when (>= end (+ start 2)) 4157 (let* ((maybe-bom (%native-u8-ref-u16 vector start))) 4158 (declare (type (unsigned-byte 16) maybe-bom)) 4159 (when (or (= maybe-bom byte-order-mark-char-code) 4160 (= maybe-bom swapped-byte-order-mark-char-code)) 4161 (incf start 2)))) 4162 (do* ((i start j) 4163 (j (+ i 2) (+ j 2)) 4149 4164 (nchars 0 (1+ nchars))) 4150 4165 ((> j end) (values nchars i))))) … … 4153 4168 ucs-2-length-of-memory-encoding 4154 4169 (lambda (pointer noctets start) 4155 (when (> noctets 1) 4156 (case (%get-unsigned-word pointer ) 4157 (#.byte-order-mark-char-code 4158 (incf start 2) 4159 (decf noctets 2)) 4160 (#.swapped-byte-order-mark-char-code 4161 (incf start 2) 4162 (decf noctets 2)))) 4163 (values (floor noctets 2) (+ start noctets)))) 4170 (let* ((skip 4171 (when (> noctets 1) 4172 (case (%get-unsigned-word pointer start) 4173 (#.byte-order-mark-char-code 4174 2) 4175 (#.swapped-byte-order-mark-char-code 4176 2))))) 4177 (values (ash (- noctets skip) -1) (logandc2 noctets 1))))) 4164 4178 :decode-literal-code-unit-limit #x10000 4165 4179 :encode-literal-char-code-limit #x10000 … … 4299 4313 (index idx (+ 4 index))) 4300 4314 ((>= index end) index) 4301 (declare (fixnum i endindex))4315 (declare (fixnum i len index)) 4302 4316 (let* ((code (%native-u8-ref-u32 vector index))) 4303 4317 (declare (type (unsigned-byte 32) code)) … … 4338 4352 (lambda (vector start end) 4339 4353 (declare (ignore vector)) 4340 (do* ((i start (1+ i))4341 (j (+ i 4) (+ i4))4354 (do* ((i start j) 4355 (j (+ i 4) (+ j 4)) 4342 4356 (nchars 0 (1+ nchars))) 4343 4357 ((> j end) (values nchars i))))) … … 4347 4361 (lambda (pointer noctets start) 4348 4362 (declare (ignore pointer)) 4349 (values ( floor noctets 4) (+ start noctets))))4363 (values (ash noctets -2) (+ start (logandc2 noctets 3))))) 4350 4364 :decode-literal-code-unit-limit #x110000 4351 4365 :encode-literal-char-code-limit #x110000 … … 4396 4410 (index idx (+ 4 index))) 4397 4411 ((>= index end) index) 4398 (declare (fixnum i endindex))4412 (declare (fixnum i len index)) 4399 4413 (let* ((code (%reversed-u8-ref-u32 vector index))) 4400 4414 (declare (type (unsigned-byte 32) code)) … … 4436 4450 (lambda (vector start end) 4437 4451 (declare (ignore vector)) 4438 (do* ((i start (1+ i))4439 (j (+ i 4) (+ i4))4452 (do* ((i start j) 4453 (j (+ i 4) (+ j 4)) 4440 4454 (nchars 0 (1+ nchars))) 4441 4455 ((> j end) (values nchars i))))) … … 4445 4459 (lambda (pointer noctets start) 4446 4460 (declare (ignore pointer)) 4447 (values ( floor noctets 4) (+ start noctets))))4461 (values (ash noctets -2) (+ start (logandc2 noctets 3))))) 4448 4462 :decode-literal-code-unit-limit #x110000 4449 4463 :encode-literal-char-code-limit #x110000 … … 4454 4468 "A 32-bit, fixed-length encoding in which all Unicode characters can be encoded in a single 32-bit word. The endianness of the encoded data is indicated by the endianness of a byte-order-mark character (#\u+feff) prepended to the data; in the absence of such a character on input, input data is assumed to be in big-endian order. Output is written in native byte order with a leading byte-order mark." 4455 4469 4456 :aliases '(:u tf-4)4470 :aliases '(:ucs-4) 4457 4471 :max-units-per-char 1 4458 4472 :code-unit-size 32 … … 4468 4482 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 4469 4483 (fixnum idx)) 4484 (setf (%native-u8-ref-u32 vector idx) byte-order-mark-char-code) 4485 (incf idx 4) 4470 4486 (do* ((i start (1+ i))) 4471 4487 ((>= i end) idx) … … 4494 4510 (index idx (1+ index))) 4495 4511 ((>= index end) index) 4496 (declare (fixnum i endindex))4512 (declare (fixnum i len index)) 4497 4513 (let* ((1st-unit (if swap 4498 4514 (%reversed-u8-ref-u32 vector index) … … 4507 4523 (lambda (string pointer idx start end) 4508 4524 (declare (fixnum idx)) 4509 4525 (setf (%get-unsigned-long pointer idx) byte-order-mark-char-code) 4526 (incf idx 4) 4510 4527 (do* ((i start (1+ i))) 4511 4528 ((>= i end) idx) … … 4541 4558 #\Replacement_Character))))))) 4542 4559 :octets-in-string-function 4543 #'ucs-4-octets-in-string 4560 (nfunction 4561 utf-32-bom-octets-in-string 4562 (lambda (string start end) 4563 (+ 4 (ucs-4-octets-in-string string start end)))) 4544 4564 :length-of-vector-encoding-function 4545 4565 (nfunction 4546 4566 utf-32-length-of-vector-encoding 4547 4567 (lambda (vector start end) 4548 (declare (ignore vector)) 4549 (do* ((i start (1+ i)) 4550 (j (+ i 2) (+ i 2)) 4568 (when (>= end (+ start 4)) 4569 (let* ((maybe-bom (%native-u8-ref-u32 vector start))) 4570 (declare (type (unsigned-byte 32) maybe-bom)) 4571 (when (or (= maybe-bom byte-order-mark-char-code) 4572 (= maybe-bom swapped-byte-order-mark-char-code)) 4573 (incf start 4)))) 4574 (do* ((i start j) 4575 (j (+ i 4) (+ J 4)) 4551 4576 (nchars 0 (1+ nchars))) 4552 4577 ((> j end) (values nchars i))))) … … 4555 4580 utf-32-length-of-memory-encoding 4556 4581 (lambda (pointer noctets start) 4557 (when (> noctets 1)4582 (when (> noctets 3) 4558 4583 (case (%get-unsigned-long pointer ) 4559 4584 (#.byte-order-mark-char-code … … 4563 4588 (incf start 4) 4564 4589 (decf noctets 4)))) 4565 (values ( floor noctets 4) (+ start noctets))))4590 (values (ash noctets -2) (+ start (logandc2 noctets 3))))) 4566 4591 :decode-literal-code-unit-limit #x110000 4567 4592 :encode-literal-char-code-limit #x110000 … … 4604 4629 (defvar *cr-newline-string* (make-string 1 :initial-element #\Return)) 4605 4630 (defvar *crlf-newline-string* (make-array 2 :element-type 'character :initial-contents '(#\Return #\Linefeed))) 4606 (defvar *nul-string (make-string 1 :initial-element #\Nul))4631 (defvar *nul-string* (make-string 1 :initial-element #\Nul)) 4607 4632 4608 4633 (defun string-size-in-octets (string &key
Note:
See TracChangeset
for help on using the changeset viewer.
