Changeset 5294
- Timestamp:
- Oct 5, 2006, 5:08:03 AM (18 years ago)
- File:
-
- 1 edited
-
trunk/ccl/level-1/l1-unicode.lisp (modified) (12 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/ccl/level-1/l1-unicode.lisp
r5278 r5294 89 89 ;; that implements this encoding with swapped byte order. 90 90 (use-byte-order-mark nil) 91 ;; Can we reliably (and dumbly) assume that code-units that appear 92 ;; to represent #\u+000a and #\u+000d in fact represent LF and CR ? 93 (allows-line-termination-detection t) 94 ;; By what other MIME names is this encoding known ? 95 (aliases nil) 91 96 (documentation nil) 92 97 ) … … 109 114 (defmacro define-character-encoding (name doc &rest args &key &allow-other-keys) 110 115 (setq name (intern (string name) "KEYWORD")) 111 `(progn 112 (setf (get-character-encoding ,name) 113 (make-character-encoding :name ,name :documentation ',doc ,@args)))) 116 (let* ((encoding (gensym)) 117 (alias (gensym))) 118 `(let* ((,encoding (make-character-encoding :name ,name :documentation ,doc ,@args))) 119 (setf (get-character-encoding ,name) ,encoding) 120 (dolist (,alias (character-encoding-aliases ,encoding)) 121 (setf (get-character-encoding ,alias) ,encoding)) 122 ',name))) 114 123 115 124 (defun encoding-name (encoding) … … 123 132 codes map to their Unicode equivalents. Intended to support most 124 133 characters used in most Western European languages." 134 135 ;; The NIL alias is used internally to mean that ISO-8859-1 is 136 ;; the "null" 8-bit encoding 137 :aliases '(nil :iso_8859-1 :latin1 :l1 :ibm819 :cp819 :csISOLatin1) 138 125 139 :stream-encode-function 126 140 (nfunction … … 215 229 ) 216 230 217 ;;; Make :ISO-8859-1 the "null" encoding (not necessarily the default). 218 (setf (get-character-encoding nil) 219 (get-character-encoding :iso-8859-1)) 231 220 232 221 233 … … 295 307 character values. Intended to provide most characters found in most 296 308 languages used in Central/Eastern Europe." 309 :aliases '(:iso_8859-2 :latin-2 :l2 :csISOLatin2) 297 310 :stream-encode-function 298 311 (nfunction … … 495 508 character values. Intended to provide most characters found in most 496 509 languages used in Southern Europe." 510 511 :aliases '(:iso_8859-3 :latin3 :l3 :csisolatin3) 497 512 :stream-encode-function 498 513 (nfunction … … 709 724 character values. Intended to provide most characters found in most 710 725 languages used in Northern Europe." 726 727 :aliases '(:iso_8859-4 :latin4 :l4 :csisolatin4) 711 728 :stream-encode-function 712 729 (nfunction … … 1630 1647 prepended to the data; in the absence of such a character on input, 1631 1648 the data is assumed to be in big-endian order." 1632 :max-units-per-char 21633 :code-unit-size 161634 :native-endianness t;not necessarily true.1635 :stream-encode-function1636 #'utf-16-stream-encode1637 :stream-decode-function1638 #'utf-16-stream-decode1639 :vector-encode-function1640 (nfunction1641 utf-16-vector-encode1642 (lambda (string vector idx &optional (start 0) (end (length string)))1643 (declare (type (simple-array (unsigned-byte 8) (*)) vector)1644 (fixnum idx))1645 (when (> end start)1646 (setf (aref vector idx) byte-order-mark-char-code)1647 (incf idx))1648 (do* ((i start (1+ i)))1649 ((>= i end) idx)1650 (let* ((char (schar string i))1651 (code (char-code char))1652 (highbits (- code #x10000)))1653 (declare (type (mod #x110000) code)1654 (fixnum highbits))1655 (cond ((< highbits 0)1656 (setf (aref vector idx) code)1657 (incf idx))1658 (t1659 (setf (aref vector idx) (logior #xd800 (the fixnum (ash highbits -10))))1660 (incf idx)1661 (setf (aref vector idx) (logior #xdc00 (the fixnum (logand highbits #x3ff))))1662 (incf idx)))))))1663 :vector-decode-function1664 (nfunction1665 utf-16-vector-decode1666 (lambda (vector idx nunits string)1667 (declare (type (simple-array (unsigned-byte 16) (*)) vector)1668 (type index idx))1669 (let* ((len (length vector))1670 (swap (if (> len idx)1671 (case (aref vector idx)1672 (#.byte-order-mark-char-code1673 (incf idx) nil)1674 (#.swapped-byte-order-mark-char-code1675 (incf idx t))1676 (t #+little-endian-target t)))))1649 :max-units-per-char 2 1650 :code-unit-size 16 1651 :native-endianness t ;not necessarily true. 1652 :stream-encode-function 1653 #'utf-16-stream-encode 1654 :stream-decode-function 1655 #'utf-16-stream-decode 1656 :vector-encode-function 1657 (nfunction 1658 utf-16-vector-encode 1659 (lambda (string vector idx &optional (start 0) (end (length string))) 1660 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 1661 (fixnum idx)) 1662 (when (> end start) 1663 (setf (aref vector idx) byte-order-mark-char-code) 1664 (incf idx)) 1665 (do* ((i start (1+ i))) 1666 ((>= i end) idx) 1667 (let* ((char (schar string i)) 1668 (code (char-code char)) 1669 (highbits (- code #x10000))) 1670 (declare (type (mod #x110000) code) 1671 (fixnum highbits)) 1672 (cond ((< highbits 0) 1673 (setf (aref vector idx) code) 1674 (incf idx)) 1675 (t 1676 (setf (aref vector idx) (logior #xd800 (the fixnum (ash highbits -10)))) 1677 (incf idx) 1678 (setf (aref vector idx) (logior #xdc00 (the fixnum (logand highbits #x3ff)))) 1679 (incf idx))))))) 1680 :vector-decode-function 1681 (nfunction 1682 utf-16-vector-decode 1683 (lambda (vector idx nunits string) 1684 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 1685 (type index idx)) 1686 (let* ((len (length vector)) 1687 (swap (if (> len idx) 1688 (case (aref vector idx) 1689 (#.byte-order-mark-char-code 1690 (incf idx) nil) 1691 (#.swapped-byte-order-mark-char-code 1692 (incf idx t)) 1693 (t #+little-endian-target t))))) 1677 1694 1678 1695 (do* ((i 0 (1+ i)) … … 1704 1721 (setf (schar string i) char) 1705 1722 (return (values nil idx)))))))))) 1706 :memory-encode-function1707 (nfunction1708 utf-16-memory-encode1709 (lambda (string pointer idx &optional (start 0) (end (length string)))1710 (declare (fixnum idx))1711 (when (> end start)1712 (setf (%get-unsigned-word pointer (+ idx idx))1713 byte-order-mark-char-code)1714 (incf idx))1715 (do* ((i start (1+ i)))1716 ((>= i end) idx)1717 (let* ((code (char-code (schar string i)))1718 (highbits (- code #x10000))1723 :memory-encode-function 1724 (nfunction 1725 utf-16-memory-encode 1726 (lambda (string pointer idx &optional (start 0) (end (length string))) 1727 (declare (fixnum idx)) 1728 (when (> end start) 1729 (setf (%get-unsigned-word pointer (+ idx idx)) 1730 byte-order-mark-char-code) 1731 (incf idx)) 1732 (do* ((i start (1+ i))) 1733 ((>= i end) idx) 1734 (let* ((code (char-code (schar string i))) 1735 (highbits (- code #x10000)) 1719 1736 (p (+ idx idx))) 1720 (declare (type (mod #x110000) code)1737 (declare (type (mod #x110000) code) 1721 1738 (fixnum p highbits)) 1722 1739 (cond ((< highbits 0) … … 1732 1749 (incf idx) 1733 1750 (incf p 2))))))) 1734 :memory-decode-function1735 (nfunction1736 utf-16-memory-decode1737 (lambda (pointer nunits idx string)1738 (declare (fixnum nunits idx))1739 (let* ((swap (when (> nunits 0)1740 (case (%get-unsigned-word pointer (+ idx idx))1751 :memory-decode-function 1752 (nfunction 1753 utf-16-memory-decode 1754 (lambda (pointer nunits idx string) 1755 (declare (fixnum nunits idx)) 1756 (let* ((swap (when (> nunits 0) 1757 (case (%get-unsigned-word pointer (+ idx idx)) 1741 1758 (#.byte-order-mark-char-code 1742 1759 (incf idx) … … 1762 1779 (if (< 1st-unit #xdc00) 1763 1780 (let* ((2nd-unit (%get-unsigned-byte pointer (incf p 2)))) 1764 (declare (type (unsigned-byte 16) 2nd-unit))1765 (if swap (setq 2nd-unit (%swap-u16 2nd-unit)))1766 (incf index)1767 (if (and (>= 2nd-unit #xdc00)1768 (< 2nd-unit #xe000))1769 (code-char (the (unsigned-byte 21)1770 (logior1771 (the (unsigned-byte 20) (ash (the (unsigned-byte 10)1772 (- 1st-unit #xd800))1773 10))1774 (the (unsigned-byte 10) (- 2nd-unit #xdc00)))))))))))1781 (declare (type (unsigned-byte 16) 2nd-unit)) 1782 (if swap (setq 2nd-unit (%swap-u16 2nd-unit))) 1783 (incf index) 1784 (if (and (>= 2nd-unit #xdc00) 1785 (< 2nd-unit #xe000)) 1786 (code-char (the (unsigned-byte 21) 1787 (logior 1788 (the (unsigned-byte 20) (ash (the (unsigned-byte 10) 1789 (- 1st-unit #xd800)) 1790 10)) 1791 (the (unsigned-byte 10) (- 2nd-unit #xdc00))))))))))) 1775 1792 (if char 1776 1793 (setf (schar string i) char) 1777 1794 (return (values nil idx))))))))) 1778 :units-in-string-function 1779 ;; Note that this does -not- include the BOM. 1780 #'utf-16-units-in-string 1781 :length-of-vector-encoding-function 1782 (nfunction 1783 utf-16-length-of-vector-encoding 1784 (lambda (vector &optional (start 0) (end (length vector))) 1785 (declare (type (simple-array (unsigned-byte 16) (*)) vector)) 1786 (let* ((swap (when (> end start) 1787 (case (aref vector start) 1788 (#.byte-order-mark-char-code 1789 (incf start) 1790 nil) 1791 (#.swapped-byte-order-mark-char-code 1792 (incf start) 1793 t) 1794 (t #+little-endian-target t))))) 1795 (do* ((i start) 1796 (nchars 0 (1+ nchars))) 1797 ((>= i end) 1798 (if (= i end) nchars)) 1799 (let* ((code (aref vector i))) 1800 (declare (type (unsigned-byte 16) code)) 1801 (if swap (setq code (%swap-u16 code))) 1802 (incf i 1803 (if (or (< code #xd800) 1804 (>= code #xe000)) 1805 1 1806 2))))))) 1807 :length-of-memory-encoding-function 1808 (nfunction 1809 utf-16-length-of-memory-encoding 1810 (lambda (pointer nunits &optional (start 0)) 1811 (let* ((swap (when (> nunits 1) 1812 (case (%get-unsigned-word pointer (+ start start)) 1813 (#.byte-order-mark-char-code 1814 (incf start) 1815 (decf nunits) 1816 nil) 1817 (#.swapped-byte-order-mark-char-code 1818 (incf start) 1819 (decf nunits) 1820 t) 1821 (t #+little-endian-target t))))) 1795 :units-in-string-function 1796 #'(lambda (&rest args) 1797 (declare (dynamic-extent args)) 1798 ;; Add one for the BOM. 1799 (1+ (apply #'utf-16-units-in-string args))) 1800 :length-of-vector-encoding-function 1801 (nfunction 1802 utf-16-length-of-vector-encoding 1803 (lambda (vector &optional (start 0) (end (length vector))) 1804 (declare (type (simple-array (unsigned-byte 16) (*)) vector)) 1805 (let* ((swap (when (> end start) 1806 (case (aref vector start) 1807 (#.byte-order-mark-char-code 1808 (incf start) 1809 nil) 1810 (#.swapped-byte-order-mark-char-code 1811 (incf start) 1812 t) 1813 (t #+little-endian-target t))))) 1814 (do* ((i start) 1815 (nchars 0 (1+ nchars))) 1816 ((>= i end) 1817 (if (= i end) nchars)) 1818 (let* ((code (aref vector i))) 1819 (declare (type (unsigned-byte 16) code)) 1820 (if swap (setq code (%swap-u16 code))) 1821 (incf i 1822 (if (or (< code #xd800) 1823 (>= code #xe000)) 1824 1 1825 2))))))) 1826 :length-of-memory-encoding-function 1827 (nfunction 1828 utf-16-length-of-memory-encoding 1829 (lambda (pointer nunits &optional (start 0)) 1830 (let* ((swap (when (> nunits 1) 1831 (case (%get-unsigned-word pointer (+ start start)) 1832 (#.byte-order-mark-char-code 1833 (incf start) 1834 (decf nunits) 1835 nil) 1836 (#.swapped-byte-order-mark-char-code 1837 (incf start) 1838 (decf nunits) 1839 t) 1840 (t #+little-endian-target t))))) 1822 1841 (do* ((i start) 1823 1842 (p (+ start start) (+ p 2)) … … 1834 1853 1 1835 1854 2)))))))) 1836 :literal-char-code-limit #x10000 1837 :use-byte-order-mark 1838 #+big-endian-target :utf-16le 1839 #+little-endian-target :utf-16be 1840 ) 1841 1842 1843 1855 :literal-char-code-limit #x10000 1856 :use-byte-order-mark 1857 #+big-endian-target :utf-16le 1858 #+little-endian-target :utf-16be 1859 ) 1860 1861 1862 (defun ucs-2-stream-encode (char write-function stream) 1863 (let* ((code (char-code char))) 1864 (declare (type (mod #x110000) code)) 1865 (if (< code #x10000) 1866 (progn 1867 (funcall write-function stream code) 1868 1)))) 1869 1870 (defun ucs-2-stream-decode (1st-unit next-unit-function stream) 1871 (declare (type (unsigned-byte 16) 1st-unit) 1872 (ignore next-unit-function stream)) 1873 ;; CODE-CHAR returns NIL on either half of a surrogate pair. 1874 (code-char 1st-unit)) 1875 1876 1877 (defun ucs-2-units-in-string (string &optional (start 0) (end (length string))) 1878 (when (>= end start) 1879 (do* ((i start (1+ i))) 1880 ((= i end) (- end start)) 1881 (let* ((code (char-code (schar string i)))) 1882 (declare (type (mod #x110000) code)) 1883 (unless (< code #x10000) (return nil)))))) 1884 1885 ;;; UCS-2, native byte order 1886 (define-character-encoding #+big-endian-target :ucs-2be #-big-endian-target :ucs-2le 1887 #+big-endian-target 1888 "A 16-bit, variable-length encoding in which characters with 1889 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 1890 little-endian word. The encoded data is implicitly big-endian; 1891 byte-order-mark characters are not interpreted on input or prepended 1892 to output." 1893 #+little-endian-target 1894 "A 16-bit, variable-length encoding in which characters with 1895 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 1896 little-endian word. The encoded data is implicitly little-endian; 1897 byte-order-mark characters are not interpreted on input or prepended 1898 to output." 1899 :max-units-per-char 1 1900 :code-unit-size 16 1901 :native-endianness t 1902 :stream-encode-function 1903 #'ucs-2-stream-encode 1904 :stream-decode-function 1905 #'ucs-2-stream-decode 1906 :vector-encode-function 1907 (nfunction 1908 native-ucs-2-vector-encode 1909 (lambda (string vector idx &optional (start 0) (end (length string))) 1910 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 1911 (fixnum idx)) 1912 (do* ((i start (1+ i))) 1913 ((>= i end) idx) 1914 (let* ((char (schar string i)) 1915 (code (char-code char))) 1916 (declare (type (mod #x110000) code) 1917 (fixnum highbits)) 1918 (cond ((< code #x10000) 1919 (setf (aref vector idx) code) 1920 (incf idx)) 1921 (t (return nil))))))) 1922 :vector-decode-function 1923 (nfunction 1924 native-ucs-2-vector-decode 1925 (lambda (vector idx nunits string) 1926 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 1927 (type index idx)) 1928 (do* ((i 0 (1+ i)) 1929 (len (length vector)) 1930 (index idx (1+ index))) 1931 ((>= i nunits) (values string index)) 1932 (declare (fixnum i len index)) 1933 (if (>= index len) 1934 (values nil idx) 1935 (let* ((char (code-char (the (unsigned-byte 16) (aref vector index))))) 1936 (if char 1937 (setf (schar string i) char) 1938 (return (values nil idx)))))))) 1939 :memory-encode-function 1940 (nfunction 1941 native-ucs-2-memory-encode 1942 (lambda (string pointer idx &optional (start 0) (end (length string))) 1943 (declare (fixnum idx)) 1944 (do* ((i start (1+ i))) 1945 ((>= i end) idx) 1946 (let* ((code (char-code (schar string i))) 1947 (p (+ idx idx))) 1948 (declare (type (mod #x110000) code) 1949 (fixnum p highbits)) 1950 (cond ((< code #x10000) 1951 (setf (%get-unsigned-word pointer p) code) 1952 (incf idx) 1953 (incf p 2)) 1954 (t 1955 (return nil))))))) 1956 :memory-decode-function 1957 (nfunction 1958 native-ucs-2-memory-decode 1959 (lambda (pointer nunits idx string) 1960 (declare (fixnum nunits idx)) 1961 (do* ((i 0 (1+ i)) 1962 (index idx (1+ index)) 1963 (p (+ index index) (+ p 2))) 1964 ((>= i nunits) (values string index)) 1965 (declare (fixnum i index p)) 1966 (let* ((1st-unit (%get-unsigned-word pointer p))) 1967 (declare (type (unsigned-byte 16) 1st-unit)) 1968 (let* ((char (code-char 1st-unit))) 1969 (setf (schar string i) char) 1970 (return (values nil idx))))))) 1971 :units-in-string-function 1972 #'ucs-2-units-in-string 1973 :length-of-vector-encoding-function 1974 (nfunction 1975 native-ucs-2-length-of-vector-encoding 1976 (lambda (vector &optional (start 0) (end (length vector))) 1977 (when (>= end start) 1978 (- end start)))) 1979 :length-of-memory-encoding-function 1980 (nfunction 1981 native-ucs-2-length-of-memory-encoding 1982 (lambda (pointer nunits &optional start) 1983 (declare (ignore pointer start)) 1984 nunits)) 1985 :literal-char-code-limit #x10000 1986 ) 1987 1988 ;;; UCS-2, reversed byte order 1989 (define-character-encoding #+big-endian-target :ucs-2le #-big-endian-target :ucs-2be 1990 #+little-endian-target 1991 "A 16-bit, variable-length encoding in which characters with 1992 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 1993 little-endian word. The encoded data is implicitly big-endian; 1994 byte-order-mark characters are not interpreted on input or prepended 1995 to output." 1996 #+big-endian-target 1997 "A 16-bit, variable-length encoding in which characters with 1998 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 1999 little-endian word. The encoded data is implicitly little-endian; 2000 byte-order-mark characters are not interpreted on input or prepended 2001 to output." 2002 :max-units-per-char 1 2003 :code-unit-size 16 2004 :native-endianness nil 2005 :stream-encode-function 2006 #'ucs-2-stream-encode 2007 :stream-decode-function 2008 #'ucs-2-stream-decode 2009 :vector-encode-function 2010 (nfunction 2011 reversed-ucs-2-vector-encode 2012 (lambda (string vector idx &optional (start 0) (end (length string))) 2013 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 2014 (fixnum idx)) 2015 (do* ((i start (1+ i))) 2016 ((>= i end) idx) 2017 (let* ((char (schar string i)) 2018 (code (char-code char))) 2019 (declare (type (mod #x110000) code) 2020 (fixnum highbits)) 2021 (cond ((< code #x10000) 2022 (setf (aref vector idx) (%swap-u16 code)) 2023 (incf idx)) 2024 (t (return nil))))))) 2025 :vector-decode-function 2026 (nfunction 2027 reversed-ucs-2-vector-decode 2028 (lambda (vector idx nunits string) 2029 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 2030 (type index idx)) 2031 (do* ((i 0 (1+ i)) 2032 (len (length vector)) 2033 (index idx (1+ index))) 2034 ((>= i nunits) (values string index)) 2035 (declare (fixnum i len index)) 2036 (if (>= index len) 2037 (values nil idx) 2038 (let* ((char (code-char (the (unsigned-byte 16) (%swap-u16 (aref vector index)))))) 2039 (if char 2040 (setf (schar string i) char) 2041 (return (values nil idx)))))))) 2042 :memory-encode-function 2043 (nfunction 2044 reversed-ucs-2-memory-encode 2045 (lambda (string pointer idx &optional (start 0) (end (length string))) 2046 (declare (fixnum idx)) 2047 (do* ((i start (1+ i))) 2048 ((>= i end) idx) 2049 (let* ((code (char-code (schar string i))) 2050 (p (+ idx idx))) 2051 (declare (type (mod #x110000) code) 2052 (fixnum p highbits)) 2053 (cond ((< code #x10000) 2054 (setf (%get-unsigned-word pointer p) (%swap-u16 code)) 2055 (incf idx) 2056 (incf p 2)) 2057 (t 2058 (return nil))))))) 2059 :memory-decode-function 2060 (nfunction 2061 reversed-ucs-2-memory-decode 2062 (lambda (pointer nunits idx string) 2063 (declare (fixnum nunits idx)) 2064 (do* ((i 0 (1+ i)) 2065 (index idx (1+ index)) 2066 (p (+ index index) (+ p 2))) 2067 ((>= i nunits) (values string index)) 2068 (declare (fixnum i index p)) 2069 (let* ((1st-unit (%swap-u16 (%get-unsigned-word pointer p)))) 2070 (declare (type (unsigned-byte 16) 1st-unit)) 2071 (let* ((char (code-char 1st-unit))) 2072 (if char 2073 (setf (schar string i) char) 2074 (return (values nil idx)))))))) 2075 :units-in-string-function 2076 #'ucs-2-units-in-string 2077 :length-of-vector-encoding-function 2078 (nfunction 2079 reversed-ucs-2-length-of-vector-encoding 2080 (lambda (vector &optional (start 0) (end (length vector))) 2081 (when (>= end start) 2082 (- end start)))) 2083 :length-of-memory-encoding-function 2084 (nfunction 2085 reversed-ucs-2-length-of-memory-encoding 2086 (lambda (pointer nunits &optional start) 2087 (declare (ignore pointer start)) 2088 nunits)) 2089 :literal-char-code-limit #x10000 2090 ) 2091 2092 (define-character-encoding :ucs-2 2093 "A 16-bit, fixed-length encoding in which characters with 2094 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit words. 2095 The endianness of the encoded data is indicated by the endianness of a 2096 byte-order-mark character (#\u+feff) prepended to the data; in the 2097 absence of such a character on input, the data is assumed to be in 2098 big-endian order." 2099 :max-units-per-char 1 2100 :code-unit-size 16 2101 :native-endianness t ;not necessarily true. 2102 :stream-encode-function 2103 #'ucs-2-stream-encode 2104 :stream-decode-function 2105 #'ucs-2-stream-decode 2106 :vector-encode-function 2107 (nfunction 2108 ucs-2-vector-encode 2109 (lambda (string vector idx &optional (start 0) (end (length string))) 2110 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2111 (fixnum idx)) 2112 (when (> end start) 2113 (setf (aref vector idx) byte-order-mark-char-code) 2114 (incf idx)) 2115 (do* ((i start (1+ i))) 2116 ((>= i end) idx) 2117 (let* ((char (schar string i)) 2118 (code (char-code char))) 2119 (declare (type (mod #x110000) code) 2120 (fixnum highbits)) 2121 (cond ((< code #x10000) 2122 (setf (aref vector idx) code) 2123 (incf idx)) 2124 (t 2125 (return nil))))))) 2126 :vector-decode-function 2127 (nfunction 2128 ucs-2-vector-decode 2129 (lambda (vector idx nunits string) 2130 (declare (type (simple-array (unsigned-byte 16) (*)) vector) 2131 (type index idx)) 2132 (let* ((len (length vector)) 2133 (swap (if (> len idx) 2134 (case (aref vector idx) 2135 (#.byte-order-mark-char-code 2136 (incf idx) nil) 2137 (#.swapped-byte-order-mark-char-code 2138 (incf idx t)) 2139 (t #+little-endian-target t))))) 2140 2141 (do* ((i 0 (1+ i)) 2142 (index idx (1+ index))) 2143 ((>= i nunits) (values string index)) 2144 (declare (fixnum i len index)) 2145 (if (>= index len) 2146 (values nil idx) 2147 (let* ((1st-unit (aref vector index))) 2148 (declare (type (unsigned-byte 16) 1st-unit)) 2149 (if swap (setq 1st-unit (%swap-u16 1st-unit))) 2150 (let* ((char (code-char 1st-unit))) 2151 (if char 2152 (setf (schar string i) char) 2153 (return (values nil idx)))))))))) 2154 :memory-encode-function 2155 (nfunction 2156 ucs-2-memory-encode 2157 (lambda (string pointer idx &optional (start 0) (end (length string))) 2158 (declare (fixnum idx)) 2159 (when (> end start) 2160 (setf (%get-unsigned-word pointer (+ idx idx)) 2161 byte-order-mark-char-code) 2162 (incf idx)) 2163 (do* ((i start (1+ i))) 2164 ((>= i end) idx) 2165 (let* ((code (char-code (schar string i))) 2166 (p (+ idx idx))) 2167 (declare (type (mod #x110000) code) 2168 (fixnum p)) 2169 (cond ((< code #x10000) 2170 (setf (%get-unsigned-word pointer p) code) 2171 (incf idx) 2172 (incf p 2)) 2173 (t 2174 (return (values nil idx)))))))) 2175 :memory-decode-function 2176 (nfunction 2177 ucs-2-memory-decode 2178 (lambda (pointer nunits idx string) 2179 (declare (fixnum nunits idx)) 2180 (let* ((swap (when (> nunits 0) 2181 (case (%get-unsigned-word pointer (+ idx idx)) 2182 (#.byte-order-mark-char-code 2183 (incf idx) 2184 (decf nunits) 2185 nil) 2186 (#.swapped-byte-order-mark-char-code 2187 (incf idx) 2188 (decf nunits) 2189 t) 2190 (t #+little-endian-target t))))) 2191 (do* ((i 0 (1+ i)) 2192 (index idx (1+ index)) 2193 (p (+ index index) (+ p 2))) 2194 ((>= i nunits) (values string index)) 2195 (declare (fixnum i index p)) 2196 (let* ((1st-unit (%get-unsigned-word pointer p))) 2197 (declare (type (unsigned-byte 16) 1st-unit)) 2198 (if swap (setq 1st-unit (%swap-u16 1st-unit))) 2199 (let* ((char (code-char 1st-unit))) 2200 (if char 2201 (setf (schar string i) char) 2202 (return (values nil idx))))))))) 2203 :units-in-string-function 2204 #'(lambda (&rest args) 2205 (declare (dynamic-extent args)) 2206 ;; Add one for the BOM. 2207 (1+ (apply #'ucs-2-units-in-string args))) 2208 :length-of-vector-encoding-function 2209 (nfunction 2210 ucs-2-length-of-vector-encoding 2211 (lambda (vector &optional (start 0) (end (length vector))) 2212 (declare (type (simple-array (unsigned-byte 16) (*)) vector)) 2213 (let* ((swap (when (> end start) 2214 (case (aref vector start) 2215 (#.byte-order-mark-char-code 2216 (incf start) 2217 nil) 2218 (#.swapped-byte-order-mark-char-code 2219 (incf start) 2220 t) 2221 (t #+little-endian-target t))))) 2222 (do* ((i start) 2223 (nchars 0 (1+ nchars))) 2224 ((>= i end) 2225 (if (= i end) nchars)) 2226 (let* ((code (aref vector i))) 2227 (declare (type (unsigned-byte 16) code)) 2228 (if swap (setq code (%swap-u16 code))) 2229 (incf i)))))) 2230 :length-of-memory-encoding-function 2231 (nfunction 2232 ucs-2-length-of-memory-encoding 2233 (lambda (pointer nunits &optional (start 0)) 2234 (when (> nunits 1) 2235 (case (%get-unsigned-word pointer (+ start start)) 2236 (#.byte-order-mark-char-code 2237 (incf start) 2238 (decf nunits) 2239 nil) 2240 (#.swapped-byte-order-mark-char-code 2241 (incf start) 2242 (decf nunits) 2243 t) 2244 (t #+little-endian-target t))) 2245 (do* ((i start (1+ i)) 2246 (p (+ start start) (+ p 2)) 2247 (nchars 0 (1+ nchars))) 2248 ((>= i nunits) 2249 (if (= i nunits) nchars))))) 2250 :literal-char-code-limit #x10000 2251 :use-byte-order-mark 2252 #+big-endian-target :ucs-2le 2253 #+little-endian-target :ucs-2be 2254 ) 2255 2256 (defun describe-character-encoding (name) 2257 (let* ((enc (lookup-character-encoding name))) 2258 (when enc 2259 (let* ((name (character-encoding-name enc)) 2260 (doc (character-encoding-documentation enc)) 2261 (aliases (character-encoding-aliases enc))) 2262 (format t "~&~s" name) 2263 (when (null (car aliases)) 2264 (pop aliases)) 2265 (when aliases 2266 (format t " [Aliases:~{ ~s~}]" aliases)) 2267 (format t "~&~a~%~%" doc) 2268 (values))))) 2269 1844 2270 (defun describe-character-encodings () 1845 (let* (( encodings nil))2271 (let* ((names nil)) 1846 2272 (maphash #'(lambda (name enc) 1847 (when name 1848 (push (cons name (character-encoding-documentation enc)) 1849 encodings))) 2273 (when (eq name (character-encoding-name enc)) 2274 (push name names))) 1850 2275 *character-encodings*) 1851 (dolist (pair (sort encodings #'string< :key #'car)) 1852 (format t "~&~s~&~a~%~%" (car pair) (cdr pair))))) 2276 (dolist (name (sort names #'string<) (values)) 2277 (describe-character-encoding name)))) 2278 2279 (defmethod make-load-form ((c character-encoding) &optional environment) 2280 (declare (ignore environment)) 2281 `(get-character-encoding ,(character-encoding-name c))) 2282 2283 (defun cstring-encoded-length-in-bytes (encoding string start end) 2284 (ash (+ 1 ; NULL terminator 2285 (funcall (character-encoding-units-in-string-function encoding) 2286 string 2287 (or start 0) 2288 (or end (length string)))) 2289 (case (character-encoding-code-unit-size encoding) 2290 (8 0) 2291 (16 1) 2292 (32 2)))) 2293 2294 (defun encode-string-to-memory (encoding pointer offset string start end) 2295 (funcall (character-encoding-memory-encode-function encoding) 2296 string pointer offset (or start 0) (or end (length string))))
Note:
See TracChangeset
for help on using the changeset viewer.
