Changeset 5228
- Timestamp:
- Sep 20, 2006, 3:20:50 AM (18 years ago)
- File:
-
- 1 edited
-
trunk/ccl/level-1/l1-unicode.lisp (modified) (5 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/ccl/level-1/l1-unicode.lisp
r5203 r5228 23 23 (defvar *character-encodings* (make-hash-table :test #'eq)) 24 24 25 (defun lookup-character-encoding (name) 26 (gethash name *character-encodings*)) 27 25 28 (defun get-character-encoding (name) 26 (or ( gethash name *character-encodings*)29 (or (lookup-character-encoding name) 27 30 (error "Unknown character encoding: ~s." name))) 28 31 … … 64 67 ;; sum of the index arg and the number of units consumed, else 65 68 ;; NIL and the incoming index arg if the character can't be 66 ;; encoded. (Note that the index args are octet offsets and67 ;; the return values should be scaled appropriately.)69 ;; encoded. (Note that the index args are and return value 70 ;; are "code unit indices", not "byte offsets".) 68 71 memory-decode-function ;(POINTER INDEX) 69 72 … … 310 313 (when (< index len) 311 314 (setf (aref vector index) code) 312 (the fixnum (+ index 1)) 313 (let* ((i1 (1+ index))) 314 (declare (fixnum i1)) 315 (if (< code #x800) 316 (when (< i1 len) 317 (setf (aref vector index) 318 (logior #xc0 (the fixnum (ash code -6))) 319 (aref vector i1) 320 (logior #x80 (the fixnum (logand code #x3f)))) 321 (the fixnum (+ i1 1))) 322 (let* ((i2 (1+ i1))) 323 (declare (fixnum i2)) 324 (if (< code #x10000) 325 (when (< i2 len) 315 (the fixnum (+ index 1))) 316 (let* ((i1 (1+ index))) 317 (declare (fixnum i1)) 318 (if (< code #x800) 319 (when (< i1 len) 320 (setf (aref vector index) 321 (logior #xc0 (the fixnum (ash code -6))) 322 (aref vector i1) 323 (logior #x80 (the fixnum (logand code #x3f)))) 324 (the fixnum (+ i1 1))) 325 (let* ((i2 (1+ i1))) 326 (declare (fixnum i2)) 327 (if (< code #x10000) 328 (when (< i2 len) 329 (setf (aref vector index) 330 (logior #xe0 (the fixnum (ash code -12))) 331 (aref vector i1) 332 (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -6))))) 333 (aref vector i2) 334 (logior #x80 (the fixnum (logand code #x3f)))) 335 (the fixnum (+ i2 1))) 336 (let* ((i3 (1+ i2))) 337 (declare (fixnum i3)) 338 (when (< i3 len) 326 339 (setf (aref vector index) 327 (logior #xe0 (the fixnum (ash code -12))) 340 (logior #xf0 341 (the fixnum (logand #x7 (the fixnum (ash code -18))))) 328 342 (aref vector i1) 343 (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -12))))) 344 (aref vector i2) 329 345 (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -6))))) 330 (aref vector i2) 331 (logior #x80 (the fixnum (logand code #x3f)))) 332 (the fixnum (+ i2 1))) 333 (let* ((i3 (1+ i2))) 334 (declare (fixnum i3)) 335 (when (< i3 len) 336 (setf (aref vector index) 337 (logior #xf0 338 (the fixnum (logand #x7 (the fixnum (ash code -18))))) 339 (aref vector i1) 340 (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -12))))) 341 (aref vector i2) 342 (logior #x80 (the fixnum (logand #x3f (the fixnum (ash code -6))))) 343 (aref vector i3) 344 (logand #x3f code)) 345 (the fixnum (+ i3 1))))))))))))) 346 (aref vector i3) 347 (logand #x3f code)) 348 (the fixnum (+ i3 1)))))))))))) 346 349 :vector-decode-function 347 350 (nfunction … … 437 440 (i2 (1+ i1)) 438 441 (i3 (1+ i2))) 439 (declare (type (mod #x110000) code i1 i2 i3)) 442 (declare (type (mod #x110000) code) 443 (fixnum i1 i2 i3)) 440 444 (cond ((< code #x80) 441 445 (setf (%get-unsigned-byte pointer idx) code) … … 590 594 ) 591 595 596 ;;; For a code-unit-size greater than 8: the stream-encode function's write-function 597 ;;; accepts a code-unit in native byte order and swaps it if necessary and the 598 ;;; stream-decode function receives a first-unit in native byte order and its 599 ;;; next-unit-function returns a unit in native byte order. The memory/vector 600 ;;; functions have to do their own byte swapping. 601 602 603 (defun utf-16-stream-encode (char write-function stream) 604 (let* ((code (char-code char)) 605 (highbits (- code #x10000))) 606 (declare (type (mod #x110000) code) 607 (fixnum highbits)) 608 (if (< highbits 0) 609 (progn 610 (funcall write-function stream code) 611 1) 612 (progn 613 (funcall write-function stream (logior #xd800 (the fixnum (ash highbits -10)))) 614 (funcall write-function (logior #xdc00 (the fixnum (logand highbits #x3ff)))) 615 2)))) 616 617 (defun utf-16-stream-decode (1st-unit next-unit-function stream) 618 (declare (type (unsigned-byte 16) 1st-unit)) 619 (if (or (< 1st-unit #xd800) 620 (>= 1st-unit #xe000)) 621 (code-char 1st-unit) 622 (if (< 1st-unit #xdc00) 623 (let* ((2nd-unit (funcall next-unit-function stream))) 624 (if (eq 2nd-unit :eof) 625 2nd-unit 626 (locally (declare (type (unsigned-byte 16) 2nd-unit)) 627 (if (and (>= 2nd-unit #xdc00) 628 (< 2nd-unit #xe000)) 629 (code-char (the (unsigned-byte 21) 630 (logior 631 (the (unsigned-byte 20) (ash (the (unsigned-byte 10) 632 (- 1st-unit #xd800)) 633 10)) 634 (the (unsigned-byte 10) (- 2nd-unit #xdc00)))))))))))) 635 636 637 (defun utf-16-units-in-string (string &optional (start 0) (end (length string))) 638 (when (>= end start) 639 (do* ((nunits 0) 640 (i start (1+ i))) 641 ((= i end) nunits) 642 (declare (fixnum nunits)) 643 (let* ((code (char-code (schar string i)))) 644 (declare (type (mod #x110000) code)) 645 (incf nunits 646 (if (< code #x10000) 647 1 648 2)))))) 649 650 ;;; utf-16, native byte order. 651 (define-character-encoding 652 #+big-endian-target :utf-16be #-big-endian-target :utf-16le 653 :max-units-per-char 2 654 :code-unit-size 16 655 :native-endianness t 656 :stream-encode-function 657 #'utf-16-stream-encode 658 :stream-decode-function 659 #'utf-16-stream-decode 660 :vector-encode-function 661 (nfunction 662 native-utf-16-vector-encode 663 (lambda (char vector index) 664 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 665 (type index index) 666 (optimize (speed 3) (safety 0))) 667 (let* ((len (length vector)) 668 (code (char-code char)) 669 (highbits (- code #x10000))) 670 (declare (type index len) 671 (type (mod #x110000) code) 672 (fixnum highbits)) 673 (if (< highbits 0) 674 (when (< index len) 675 (setf (aref vector index) code) 676 (the fixnum (+ index 1))) 677 (let* ((i1 (1+ index))) 678 (declare (fixnum i1)) 679 (when (< i1 len) 680 (setf (aref vector index) (logior #xd800 (the fixnum (ash highbits -10))) 681 (aref vector i1) (logior #xdc00 (the fixnum (logand highbits #x3ff)))) 682 (the fixnum (1+ i1)))))))) 683 :vector-decode-function 684 (nfunction 685 native-utf-16-vector-decode 686 (lambda (vector idx) 687 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 688 (type index idx)) 689 (let* ((len (length vector))) 690 (declare (fixnum len)) 691 (if (>= idx len) 692 (values nil idx) 693 (let* ((1st-unit (aref vector idx))) 694 (declare (type (unsigned-byte 16) 1st-unit)) 695 (if (or (< 1st-unit #xd800) 696 (>= 1st-unit #xe000)) 697 (values (code-char 1st-unit) 698 (the fixnum (1+ idx))) 699 (if (>= 1st-unit #xdc00) 700 (values nil idx) 701 (let* ((i1 (1+ idx))) 702 (declare (fixnum i1)) 703 (if (>= i1 len) 704 (values nil idx) 705 (let* ((2nd-unit (aref vector i1))) 706 (declare (type (unsigned-byte 16) 2nd-unit)) 707 (if (and (>= 2nd-unit #xdc00) 708 (< 2nd-unit #xe000)) 709 (values 710 (code-char (the (unsigned-byte 21) 711 (logior 712 (the (unsigned-byte 20) (ash (the (unsigned-byte 10) 713 (- 1st-unit #xd800)) 714 10)) 715 (the (unsigned-byte 10) (- 2nd-unit #xdc00))))) 716 (the fixnum (1+ i1))) 717 (values nil idx)))))))))))) 718 :memory-encode-function 719 (nfunction 720 native-utf-16-memory-encode 721 (lambda (char pointer idx) 722 (declare (fixnum idx)) 723 (let* ((code (char-code char)) 724 (highbits (- code #x10000)) 725 (i0 (+ idx idx)) 726 (i1 (+ i0 2))) 727 (declare (type (mod #x110000) code) 728 (fixnum i0 i1 highbits)) 729 (cond ((< highbits 0) 730 (setf (%get-unsigned-word pointer i0) code) 731 (the fixnum (1+ idx))) 732 733 (t 734 (setf (%get-unsigned-word pointer i0) (logior #xd800 (the fixnum (ash highbits -10))) 735 (%get-unsigned-word pointer i1) (logior #xdc00 (the fixnum (logand highbits #x3ff)))) 736 737 (the fixnum (+ idx 2))))))) 738 :memory-decode-function 739 (nfunction 740 native-utf-16-memory-decode 741 (lambda (pointer idx) 742 (declare (fixnum idx)) 743 (let* ((i0 (+ idx idx)) 744 (1st-unit (%get-unsigned-word pointer i0)) 745 (i1 (+ i0 2))) 746 (declare (type (unsigned-byte 16) 1st-unit) 747 (fixnum i1 i2 i3)) 748 (if (or (< 1st-unit #xd800) 749 (>= 1st-unit #xe000)) 750 (values (code-char 1st-unit) (the fixnum (1+ idx))) 751 (if (< 1st-unit #xdc00) 752 (let* ((2nd-unit (%get-unsigned-word pointer i1))) 753 (declare (type (unsigned-byte 16) 2nd-unit)) 754 (if (and (>= 2nd-unit #xdc00) 755 (< 2nd-unit #xe000)) 756 (values 757 (code-char (the (unsigned-byte 21) 758 (logior 759 (the (unsigned-byte 20) (ash (the (unsigned-byte 10) 760 (- 1st-unit #xd800)) 761 10)) 762 (the (unsigned-byte 10) (- 2nd-unit #xdc00))))) 763 (the fixnum (+ idx 2)))))))))) 764 :units-in-string-function 765 #'utf-16-units-in-string 766 :length-of-vector-encoding-function 767 (nfunction 768 native-utf-16-length-of-vector-encoding 769 (lambda (vector &optional (start 0) (end (length vector))) 770 (declare (type (simple-array (unsigned-byte 16) (*)) vector)) 771 (do* ((i start) 772 (nchars 0 (1+ nchars))) 773 ((>= i end) 774 (if (= i end) nchars)) 775 (let* ((code (aref vector i))) 776 (declare (type (unsigned-byte 8) code)) 777 (incf i 778 (if (or (< code #xd800) 779 (>= code #xe000)) 780 1 781 2)))))) 782 :length-of-memory-encoding-function 783 (nfunction 784 native-utf-8-length-of-memory-encoding 785 (lambda (pointer nunits &optional (start 0)) 786 (do* ((i start) 787 (p (+ start start) (+ p 2)) 788 (nchars 0 (1+ nchars))) 789 ((>= i nunits) 790 (if (= i nunits) nchars)) 791 (let* ((code (%get-unsigned-word pointer p))) 792 (declare (type (unsigned-byte 16) code)) 793 (incf i 794 (incf i 795 (if (or (< code #xd800) 796 (>= code #xe000)) 797 1 798 2))))))) 799 :literal-char-code-limit #x10000 800 ) 801 802 ;;; utf-16, reversed byte order 803 (define-character-encoding 804 #+big-endian-target :utf-16le #-big-endian-target :utf-16be 805 :max-units-per-char 2 806 :code-unit-size 16 807 :native-endianness nil 808 :stream-encode-function 809 #'utf-16-stream-encode 810 :stream-decode-function 811 #'utf-16-stream-decode 812 :vector-encode-function 813 (nfunction 814 reversed-utf-16-vector-encode 815 (lambda (char vector index) 816 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 817 (type index index) 818 (optimize (speed 3) (safety 0))) 819 (let* ((len (length vector)) 820 (code (char-code char)) 821 (highbits (- code #x10000))) 822 (declare (type index len) 823 (type (mod #x110000) code) 824 (fixnum highbits)) 825 (if (< highbits 0) 826 (when (< index len) 827 (setf (aref vector index) (%swap-u16 code)) 828 (the fixnum (+ index 1))) 829 (let* ((i1 (1+ index))) 830 (declare (fixnum i1)) 831 (when (< i1 len) 832 (setf (aref vector index) 833 (%swap-u16 (logior #xd800 (the fixnum (ash highbits -10)))) 834 (aref vector i1) 835 (%swap-u16 (logior #xdc00 (the fixnum (logand highbits #x3ff))))) 836 (the fixnum (1+ i1)))))))) 837 :vector-decode-function 838 (nfunction 839 reversed-utf-16-vector-decode 840 (lambda (vector idx) 841 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 842 (type index idx)) 843 (let* ((len (length vector))) 844 (declare (fixnum len)) 845 (if (>= idx len) 846 (values nil idx) 847 (let* ((1st-unit (%swap-u16 (aref vector idx)))) 848 (declare (type (unsigned-byte 16) 1st-unit)) 849 (if (or (< 1st-unit #xd800) 850 (>= 1st-unit #xe000)) 851 (values (code-char 1st-unit) 852 (the fixnum (1+ idx))) 853 (if (>= 1st-unit #xdc00) 854 (values nil idx) 855 (let* ((i1 (1+ idx))) 856 (declare (fixnum i1)) 857 (if (>= i1 len) 858 (values nil idx) 859 (let* ((2nd-unit (%swap-u16 (aref vector i1)))) 860 (declare (type (unsigned-byte 16) 2nd-unit)) 861 (if (and (>= 2nd-unit #xdc00) 862 (< 2nd-unit #xe000)) 863 (values 864 (code-char (the (unsigned-byte 21) 865 (logior 866 (the (unsigned-byte 20) (ash (the (unsigned-byte 10) 867 (- 1st-unit #xd800)) 868 10)) 869 (the (unsigned-byte 10) (- 2nd-unit #xdc00))))) 870 (the fixnum (1+ i1))) 871 (values nil idx)))))))))))) 872 :memory-encode-function 873 (nfunction 874 reversed-utf-16-memory-encode 875 (lambda (char pointer idx) 876 (declare (fixnum idx)) 877 (let* ((code (char-code char)) 878 (highbits (- code #x10000)) 879 (i0 (+ idx idx)) 880 (i1 (+ i0 2))) 881 (declare (type (mod #x110000) code) 882 (fixnum i0 i1 highbits)) 883 (cond ((< highbits 0) 884 (setf (%get-unsigned-word pointer i0) (%swap-u16 code)) 885 (the fixnum (1+ idx))) 886 (t 887 (setf (%get-unsigned-word pointer i0) 888 (%swap-u16 (logior #xd800 (the fixnum (ash highbits -10)))) 889 (%get-unsigned-word pointer i1) 890 (%swap-u16 (logior #xdc00 (the fixnum (logand highbits #x3ff))))) 891 (the fixnum (+ idx 2))))))) 892 :memory-decode-function 893 (nfunction 894 reversed-utf-16-memory-decode 895 (lambda (pointer idx) 896 (declare (fixnum idx)) 897 (let* ((i0 (+ idx idx)) 898 (1st-unit (%swap-u16 (%get-unsigned-word pointer i0))) 899 (i1 (+ i0 2))) 900 (declare (type (unsigned-byte 16) 1st-unit) 901 (fixnum i1 i2 i3)) 902 (if (or (< 1st-unit #xd800) 903 (>= 1st-unit #xe000)) 904 (values (code-char 1st-unit) (the fixnum (1+ idx))) 905 (if (< 1st-unit #xdc00) 906 (let* ((2nd-unit (%swap-u16 (%get-unsigned-word pointer i1)))) 907 (declare (type (unsigned-byte 16) 2nd-unit)) 908 (if (and (>= 2nd-unit #xdc00) 909 (< 2nd-unit #xe000)) 910 (values 911 (code-char (the (unsigned-byte 21) 912 (logior 913 (the (unsigned-byte 20) (ash (the (unsigned-byte 10) 914 (- 1st-unit #xd800)) 915 10)) 916 (the (unsigned-byte 10) (- 2nd-unit #xdc00))))) 917 (the fixnum (+ idx 2)))))))))) 918 :units-in-string-function 919 #'utf-16-units-in-string 920 :length-of-vector-encoding-function 921 (nfunction 922 reversed-utf-16-length-of-vector-encoding 923 (lambda (vector &optional (start 0) (end (length vector))) 924 (declare (type (simple-array (unsigned-byte 16) (*)) vector)) 925 (do* ((i start) 926 (nchars 0 (1+ nchars))) 927 ((>= i end) 928 (if (= i end) nchars)) 929 (let* ((code (%swap-u16 (aref vector i)))) 930 (declare (type (unsigned-byte 8) code)) 931 (incf i 932 (if (or (< code #xd800) 933 (>= code #xe000)) 934 1 935 2)))))) 936 :length-of-memory-encoding-function 937 (nfunction 938 reversed-utf-8-length-of-memory-encoding 939 (lambda (pointer nunits &optional (start 0)) 940 (do* ((i start) 941 (p (+ start start) (+ p 2)) 942 (nchars 0 (1+ nchars))) 943 ((>= i nunits) 944 (if (= i nunits) nchars)) 945 (let* ((code (%swap-u16 (%get-unsigned-word pointer p)))) 946 (declare (type (unsigned-byte 8) code)) 947 (incf i 948 (incf i 949 (if (or (< code #xd800) 950 (>= code #xe000)) 951 1 952 2))))))) 953 :literal-char-code-limit #x10000 954 )
Note:
See TracChangeset
for help on using the changeset viewer.
