Changeset 5353
- Timestamp:
- Oct 15, 2006, 4:51:04 PM (18 years ago)
- File:
-
- 1 edited
-
trunk/ccl/level-1/l1-unicode.lisp (modified) (8 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/ccl/level-1/l1-unicode.lisp
r5335 r5353 108 108 (defconstant swapped-byte-order-mark-char-code (char-code swapped-byte-order-mark)) 109 109 110 111 (defmethod default-character-encoding ((domain t)) 112 (character-encoding-name (get-character-encoding nil))) 110 113 111 114 (defun decode-character-encoded-vector (encoding vector start-index noctets string) … … 1691 1694 indicated by the endianness of a byte-order-mark character (#\u+feff) 1692 1695 prepended to the data; in the absence of such a character on input, 1693 the data is assumed to be in big-endian order." 1696 the data is assumed to be in big-endian order. Output is written 1697 in native byte-order with a leading byte-order mark." 1694 1698 :max-units-per-char 2 1695 1699 :code-unit-size 16 … … 1920 1924 (define-character-encoding #+big-endian-target :ucs-2be #-big-endian-target :ucs-2le 1921 1925 #+big-endian-target 1922 "A 16-bit, variable-length encoding in which characters with1926 "A 16-bit, fixed-length encoding in which characters with 1923 1927 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 1924 little-endian word. The encoded data is implicitly big-endian;1928 big-endian word. The encoded data is implicitly big-endian; 1925 1929 byte-order-mark characters are not interpreted on input or prepended 1926 1930 to output." 1927 1931 #+little-endian-target 1928 "A 16-bit, variable-length encoding in which characters with1932 "A 16-bit, fixed-length encoding in which characters with 1929 1933 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 1930 1934 little-endian word. The encoded data is implicitly little-endian; … … 2016 2020 (define-character-encoding #+big-endian-target :ucs-2le #-big-endian-target :ucs-2be 2017 2021 #+little-endian-target 2018 "A 16-bit, variable-length encoding in which characters with2022 "A 16-bit, fixed-length encoding in which characters with 2019 2023 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 2020 little-endian word. The encoded data is implicitly big-endian;2024 big-endian word. The encoded data is implicitly big-endian; 2021 2025 byte-order-mark characters are not interpreted on input or prepended 2022 2026 to output." 2023 2027 #+big-endian-target 2024 "A 16-bit, variable-length encoding in which characters with2028 "A 16-bit, fixed-length encoding in which characters with 2025 2029 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit 2030 2026 2031 little-endian word. The encoded data is implicitly little-endian; 2027 2032 byte-order-mark characters are not interpreted on input or prepended … … 2088 2093 (let* ((1st-unit (%swap-u16 (%get-unsigned-word pointer index)))) 2089 2094 (declare (type (unsigned-byte 16) 1st-unit)) 2090 (setf (schar string i) (or (c har-code1st-unit) #\Replacement_Character))))))2095 (setf (schar string i) (or (code-char 1st-unit) #\Replacement_Character)))))) 2091 2096 :octets-in-string-function 2092 2097 #'ucs-2-octets-in-string … … 2111 2116 (define-character-encoding :ucs-2 2112 2117 "A 16-bit, fixed-length encoding in which characters with 2113 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit word s.2118 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit word. 2114 2119 The endianness of the encoded data is indicated by the endianness of a 2115 2120 byte-order-mark character (#\u+feff) prepended to the data; in the … … 2232 2237 (decf noctets 2)) 2233 2238 (#.swapped-byte-order-mark-char-code 2234 (incf start )2235 (decf noctets ))))2239 (incf start 2) 2240 (decf noctets 2)))) 2236 2241 (values (floor noctets 2) (+ start noctets)))) 2237 2242 :literal-char-code-limit #x10000 … … 2239 2244 #+big-endian-target :ucs-2le 2240 2245 #+little-endian-target :ucs-2be 2246 ) 2247 2248 2249 (defun ucs-4-stream-encode (char write-function stream) 2250 (let* ((code (char-code char))) 2251 (declare (type (mod #x110000) code)) 2252 (funcall write-function stream code) 2253 1)) 2254 2255 (defun ucs-4-stream-decode (1st-unit next-unit-function stream) 2256 (declare (type (unsigned-byte 16) 1st-unit) 2257 (ignore next-unit-function stream)) 2258 (code-char 1st-unit)) 2259 2260 2261 (defun ucs-4-octets-in-string (string start end) 2262 (declare (ignore string)) 2263 (if (>= end start) 2264 (* 4 (- end start)) 2265 0)) 2266 2267 2268 (declaim (inline %big-endian-u8-ref-u32 %little-endian-u8-ref-u32)) 2269 (defun %big-endian-u8-ref-u32 (u8-vector idx) 2270 (declare (type (simple-array (unsigned-byte 8) (*)) u8-vector) 2271 (fixnum idx)) 2272 (logior (the (unsigned-byte 32) (ash (the (unsigned-byte 8) (aref u8-vector idx)) 24)) 2273 (the (unsigned-byte 24) 2274 (logior 2275 (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (1+ idx)))) 16) 2276 (the (unsigned-byte 16) 2277 (logior 2278 (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 2)))) 8) 2279 (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 3)))))))))) 2280 2281 (defun %little-endian-u8-ref-u32 (u8-vector idx) 2282 (declare (type (simple-array (unsigned-byte 8) (*)) u8-vector) 2283 (fixnum idx)) 2284 (logior (the (unsigned-byte 32) (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 3)))) 24)) 2285 (the (unsigned-byte 24) 2286 (logior 2287 (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 2)))) 16) 2288 (the (unsigned-byte 16) 2289 (logior 2290 (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (1+ idx)))) 8) 2291 (the (unsigned-byte 8) (aref u8-vector (the fixnum idx))))))))) 2292 2293 #+big-endian-target 2294 (progn 2295 (defmacro %native-u8-ref-u32 (vector idx) 2296 `(%big-endian-u8-ref-u32 ,vector ,idx)) 2297 2298 (defmacro %reversed-u8-ref-u32 (vector idx) 2299 `(%little-endian-u8-ref-u32 ,vector ,idx)) 2300 ) 2301 2302 #+little-endian-target 2303 (progn 2304 (defmacro %native-u8-ref-u32 (vector idx) 2305 `(%little-endian-u8-ref-u32 ,vector ,idx)) 2306 2307 (defmacro %reversed-u8-ref-u32 (vector idx) 2308 `(%big-endian-u8-ref-u32 ,vector ,idx)) 2309 ) 2310 2311 2312 (declaim (inline (setf %big-endian-u8-ref-32) (setf %little-endian-u8-ref-u32))) 2313 (defun (setf %big-endian-u8-ref-u32) (val u8-vector idx) 2314 (declare (type (unsigned-byte 32) val) 2315 (type (simple-array (unsigned-byte 8) (*)) u8-vector) 2316 (fixnum idx)) 2317 (setf (aref u8-vector idx) (ldb (byte 8 24) val) 2318 (aref u8-vector (the fixnum (1+ idx))) (ldb (byte 8 16) val) 2319 (aref u8-vector (the fixnum (+ idx 2))) (ldb (byte 8 8) val) 2320 (aref u8-vector (the fixnum (+ idx 3))) (ldb (byte 8 0) val)) 2321 val) 2322 2323 (defun (setf %little-endian-u8-ref-u32) (val u8-vector idx) 2324 (declare (type (unsigned-byte 16) val) 2325 (type (simple-array (unsigned-byte 8) (*)) u8-vector) 2326 (fixnum idx)) 2327 (setf (aref u8-vector idx) (ldb (byte 8 0) val) 2328 (aref u8-vector (the fixnum (1+ idx))) (ldb (byte 8 8) val) 2329 (aref u8-vector (the fixnum (+ idx 2))) (ldb (byte 8 16) val) 2330 (aref u8-vector (the fixnum (+ idx 3))) (ldb (byte 8 24) val)) 2331 val) 2332 2333 2334 ;;; UTF-32/UCS-4, native byte order 2335 (define-character-encoding #+big-endian-target :utf32-be #-big-endian-target :utf32-le 2336 #+big-endian-target 2337 "A 32-bit, fixed-length encoding in which all Unicode characters 2338 encoded in a single 32-bit word. The encoded data is implicitly big-endian; 2339 byte-order-mark characters are not interpreted on input or prepended 2340 to output." 2341 #+little-endian-target 2342 "A 32-bit, fixed-length encoding in which all Unicode characters can 2343 encoded in a single 32-bit word. The encoded data is implicitly 2344 little-endian; byte-order-mark characters are not interpreted on input 2345 or prepended to output." 2346 :aliases #+big-endian-target '(:ucs-4be) #+little-endian-target '(:ucs-4le) 2347 :max-units-per-char 1 2348 :code-unit-size 32 2349 :native-endianness t 2350 :stream-encode-function 2351 #'ucs-4-stream-encode 2352 :Stream-decode-function 2353 #'ucs-4-stream-decode 2354 :vector-encode-function 2355 (nfunction 2356 native-ucs-4-vector-encode 2357 (lambda (string vector idx start end) 2358 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2359 (fixnum idx)) 2360 (do* ((i start (1+ i))) 2361 ((>= i end) idx) 2362 (let* ((char (schar string i)) 2363 (code (char-code char))) 2364 (declare (type (mod #x110000) code)) 2365 (setf (%native-u8-ref-u32 vector idx) code) 2366 (incf idx 4))))) 2367 :vector-decode-function 2368 (nfunction 2369 native-ucs-4-vector-decode 2370 (lambda (vector idx noctets string) 2371 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2372 (type index idx)) 2373 (do* ((i 0 (1+ i)) 2374 (end (+ idx noctets)) 2375 (index idx (+ 4 index))) 2376 ((>= index end) index) 2377 (declare (fixnum i len index)) 2378 (let* ((code (%native-u8-ref-u32 vector index))) 2379 (declare (type (unsigned-byte 32) code)) 2380 (setf (schar string i) 2381 (or (if (< code char-code-limit) 2382 (code-char code)) 2383 #\Replacement_Character)))))) 2384 :memory-encode-function 2385 (nfunction 2386 native-ucs-4-memory-encode 2387 (lambda (string pointer idx start end) 2388 (declare (fixnum idx)) 2389 (do* ((i start (1+ i))) 2390 ((>= i end) idx) 2391 (let* ((code (char-code (schar string i)))) 2392 (declare (type (mod #x110000) code)) 2393 (setf (%get-unsigned-long pointer idx) code) 2394 (incf idx 4))))) 2395 :memory-decode-function 2396 (nfunction 2397 native-ucs-4-memory-decode 2398 (lambda (pointer noctets idx string) 2399 (declare (fixnum noctets idx)) 2400 (do* ((i 0 (1+ i)) 2401 (index idx (+ index 4))) 2402 ((>= i noctets) index) 2403 (declare (fixnum i index)) 2404 (let* ((1st-unit (%get-unsigned-long pointer index))) 2405 (declare (type (unsigned-byte 32) 1st-unit)) 2406 (setf (schar string i) (or (if (< 1st-unit char-code-limit) 2407 (code-char 1st-unit)) 2408 #\Replacement_Character)))))) 2409 :octets-in-string-function 2410 #'ucs-4-octets-in-string 2411 :length-of-vector-encoding-function 2412 (nfunction 2413 native-ucs-4-length-of-vector-encoding 2414 (lambda (vector start end) 2415 (declare (ignore vector)) 2416 (do* ((i start (1+ i)) 2417 (j (+ i 4) (+ i 4)) 2418 (nchars 0 (1+ nchars))) 2419 ((> j end) (values nchars i))))) 2420 :length-of-memory-encoding-function 2421 (nfunction 2422 native-ucs-4-length-of-memory-encoding 2423 (lambda (pointer noctets start) 2424 (declare (ignore pointer)) 2425 (values (floor noctets 4) (+ start noctets)))) 2426 :literal-char-code-limit #x110000 2427 ) 2428 2429 ;;; UTF-32/UCS-4, reversed byte order 2430 (define-character-encoding #+big-endian-target :utf32-le #-big-endian-target :utf32-be 2431 #+little-endian-target 2432 "A 32-bit, fixed-length encoding in which all Unicode characters 2433 encoded in a single 32-bit word. The encoded data is implicitly big-endian; 2434 byte-order-mark characters are not interpreted on input or prepended 2435 to output." 2436 #+big-endian-target 2437 "A 32-bit, fixed-length encoding in which all Unicode characters can 2438 encoded in a single 32-bit word. The encoded data is implicitly 2439 little-endian; byte-order-mark characters are not interpreted on input 2440 or prepended to output." 2441 :aliases #+big-endian-target '(:ucs-4le) #+little-endian-target '(:ucs-4be) 2442 :max-units-per-char 1 2443 :code-unit-size 32 2444 :native-endianness nil 2445 :stream-encode-function 2446 #'ucs-4-stream-encode 2447 :Stream-decode-function 2448 #'ucs-4-stream-decode 2449 :vector-encode-function 2450 (nfunction 2451 native-ucs-4-vector-encode 2452 (lambda (string vector idx start end) 2453 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2454 (fixnum idx)) 2455 (do* ((i start (1+ i))) 2456 ((>= i end) idx) 2457 (let* ((char (schar string i)) 2458 (code (char-code char))) 2459 (declare (type (mod #x110000) code)) 2460 (setf (%reversed-u8-ref-u32 vector idx) code) 2461 (incf idx 4))))) 2462 :vector-decode-function 2463 (nfunction 2464 native-ucs-4-vector-decode 2465 (lambda (vector idx noctets string) 2466 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2467 (type index idx)) 2468 (do* ((i 0 (1+ i)) 2469 (end (+ idx noctets)) 2470 (index idx (+ 4 index))) 2471 ((>= index end) index) 2472 (declare (fixnum i len index)) 2473 (let* ((code (%reversed-u8-ref-u32 vector index))) 2474 (declare (type (unsigned-byte 32) code)) 2475 (setf (schar string i) 2476 (or (if (< code char-code-limit) 2477 (code-char code)) 2478 #\Replacement_Character)))))) 2479 :memory-encode-function 2480 (nfunction 2481 native-ucs-4-memory-encode 2482 (lambda (string pointer idx start end) 2483 (declare (fixnum idx)) 2484 (do* ((i start (1+ i))) 2485 ((>= i end) idx) 2486 (let* ((code (char-code (schar string i)))) 2487 (declare (type (mod #x110000) code)) 2488 (setf (%get-unsigned-long pointer idx) (%swap-u32 code)) 2489 (incf idx 4))))) 2490 :memory-decode-function 2491 (nfunction 2492 reversed-ucs-4-memory-decode 2493 (lambda (pointer noctets idx string) 2494 (declare (fixnum noctets idx)) 2495 (do* ((i 0 (1+ i)) 2496 (index idx (+ index 4))) 2497 ((>= i noctets) index) 2498 (declare (fixnum i index)) 2499 (let* ((1st-unit (%swap-u32 (%get-unsigned-long pointer index)))) 2500 (declare (type (unsigned-byte 32) 1st-unit)) 2501 (setf (schar string i) (or (if (< 1st-unit char-code-limit) 2502 (code-char 1st-unit)) 2503 #\Replacement_Character)))))) 2504 2505 :octets-in-string-function 2506 #'ucs-4-octets-in-string 2507 :length-of-vector-encoding-function 2508 (nfunction 2509 reversed-ucs-4-length-of-vector-encoding 2510 (lambda (vector start end) 2511 (declare (ignore vector)) 2512 (do* ((i start (1+ i)) 2513 (j (+ i 4) (+ i 4)) 2514 (nchars 0 (1+ nchars))) 2515 ((> j end) (values nchars i))))) 2516 :length-of-memory-encoding-function 2517 (nfunction 2518 reversed-ucs-4-length-of-memory-encoding 2519 (lambda (pointer noctets start) 2520 (declare (ignore pointer)) 2521 (values (floor noctets 4) (+ start noctets)))) 2522 :literal-char-code-limit #x110000 2523 ) 2524 2525 (define-character-encoding :utf-32 2526 "A 32-bit, fixed-length encoding in which all Unicode characters can be encoded in a single 32-bit word. The endianness of the encoded data is indicated by the endianness of a byte-order-mark character (#\u+feff) prepended to the data; in the absence of such a character on input, input data is assumed to be in big-endian order. Output is written in native byte order with a leading byte-order mark." 2527 2528 :aliases '(:utf-4) 2529 :max-units-per-char 1 2530 :code-unit-size 32 2531 :native-endianness t ;not necessarily true. 2532 :stream-encode-function 2533 #+ucs-4-stream-encode 2534 :stream-decode-function 2535 #'ucs-4-stream-decode 2536 :vector-encode-function 2537 (nfunction 2538 utf-32-vector-encode 2539 (lambda (string vector idx start end) 2540 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2541 (fixnum idx)) 2542 (when (> end start) 2543 (setf (%native-u8-ref-u32 vector idx) byte-order-mark-char-code) 2544 (incf idx 4)) 2545 (do* ((i start (1+ i))) 2546 ((>= i end) idx) 2547 (let* ((char (schar string i)) 2548 (code (char-code char))) 2549 (declare (type (mod #x110000) code)) 2550 (setf (%native-u8-ref-u32 vector idx) code) 2551 (incf idx 4))))) 2552 :vector-decode-function 2553 (nfunction 2554 utf-32-vector-decode 2555 (lambda (vector idx noctets string) 2556 (declare (type (simple-array (unsigned-byte 8) (*)) vector) 2557 (type index idx) 2558 (fixnum noctets)) 2559 (let* ((swap (if (> noctets 3) 2560 (case (%native-u8-ref-u32 vector idx) 2561 (#.byte-order-mark-char-code 2562 (incf idx 4) (decf noctets 4) nil) 2563 (#.swapped-byte-order-mark-char-code 2564 (incf idx 4) (decf noctets 4) t) 2565 (t #+little-endian-target t))))) 2566 2567 (do* ((i 0 (1+ i)) 2568 (end (+ idx noctets)) 2569 (index idx (1+ index))) 2570 ((>= index end) index) 2571 (declare (fixnum i len index)) 2572 (let* ((1st-unit (if swap 2573 (%reversed-u8-ref-u32 vector index) 2574 (%native-u8-ref-u32 vector index)))) 2575 (declare (type (unsigned-byte 32) 1st-unit)) 2576 (setf (schar string i) (or (if (< 1st-unit char-code-limit) 2577 (code-char 1st-unit)) 2578 #\Replacement_Character))))))) 2579 :memory-encode-function 2580 (nfunction 2581 utf-32-memory-encode 2582 (lambda (string pointer idx start end) 2583 (declare (fixnum idx)) 2584 (when (> end start) 2585 (setf (%get-unsigned-long pointer idx) 2586 byte-order-mark-char-code) 2587 (incf idx 4)) 2588 (do* ((i start (1+ i))) 2589 ((>= i end) idx) 2590 (let* ((code (char-code (schar string i)))) 2591 (declare (type (mod #x110000) code)) 2592 (setf (%get-unsigned-long pointer idx) code) 2593 (incf idx 4))))) 2594 :memory-decode-function 2595 (nfunction 2596 utf-32-memory-decode 2597 (lambda (pointer noctets idx string) 2598 (declare (fixnum noctets idx)) 2599 (let* ((swap (when (> noctets 3) 2600 (case (%get-unsigned-long pointer idx) 2601 (#.byte-order-mark-char-code 2602 (incf idx 4) 2603 (decf noctets 4) 2604 nil) 2605 (#.swapped-byte-order-mark-char-code 2606 (incf idx 4) 2607 (decf noctets 4) 2608 t) 2609 (t #+little-endian-target t))))) 2610 (do* ((i 0 (1+ i)) 2611 (index idx (+ index 2))) 2612 ((>= i noctets) index) 2613 (declare (fixnum i index)) 2614 (let* ((1st-unit (%get-unsigned-long pointer index))) 2615 (declare (type (unsigned-byte 32) 1st-unit)) 2616 (if swap (setq 1st-unit (%swap-u32 1st-unit))) 2617 (setf (schar string i) (or (if (< 1st-unit char-code-limit) 2618 (code-char 1st-unit)) 2619 #\Replacement_Character))))))) 2620 :octets-in-string-function 2621 #'(lambda (&rest args) 2622 (declare (dynamic-extent args)) 2623 ;; Add four for the BOM. 2624 (+ 4 (apply #'ucs-4-octets-in-string args))) 2625 :length-of-vector-encoding-function 2626 (nfunction 2627 utf-32-length-of-vector-encoding 2628 (lambda (vector start end) 2629 (declare (ignore vector)) 2630 (do* ((i start (1+ i)) 2631 (j (+ i 2) (+ i 2)) 2632 (nchars 0 (1+ nchars))) 2633 ((> j end) (values nchars i))))) 2634 :length-of-memory-encoding-function 2635 (nfunction 2636 utf-32-length-of-memory-encoding 2637 (lambda (pointer noctets start) 2638 (when (> noctets 1) 2639 (case (%get-unsigned-long pointer ) 2640 (#.byte-order-mark-char-code 2641 (incf start 4) 2642 (decf noctets 4)) 2643 (#.swapped-byte-order-mark-char-code 2644 (incf start 4) 2645 (decf noctets 4)))) 2646 (values (floor noctets 4) (+ start noctets)))) 2647 :literal-char-code-limit #x110000 2648 :use-byte-order-mark 2649 #+big-endian-target :utf-32le 2650 #+little-endian-target :utf-32be 2241 2651 ) 2242 2652
Note:
See TracChangeset
for help on using the changeset viewer.
