Changeset 5353


Ignore:
Timestamp:
Oct 15, 2006, 4:51:04 PM (18 years ago)
Author:
Gary Byers
Message:

utf-32/ucs-4 and variants.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/ccl/level-1/l1-unicode.lisp

    r5335 r5353  
    108108(defconstant swapped-byte-order-mark-char-code (char-code swapped-byte-order-mark))
    109109
     110
     111(defmethod default-character-encoding ((domain t))
     112  (character-encoding-name (get-character-encoding nil)))
    110113
    111114(defun decode-character-encoded-vector (encoding vector start-index noctets string)
     
    16911694indicated by the endianness of a byte-order-mark character (#\u+feff)
    16921695prepended to the data; in the absence of such a character on input,
    1693 the data is assumed to be in big-endian order."   
     1696the data is assumed to be in big-endian order. Output is written
     1697in native byte-order with a leading byte-order mark."   
    16941698  :max-units-per-char 2
    16951699  :code-unit-size 16
     
    19201924(define-character-encoding #+big-endian-target :ucs-2be #-big-endian-target :ucs-2le
    19211925  #+big-endian-target
    1922   "A 16-bit, variable-length encoding in which characters with
     1926  "A 16-bit, fixed-length encoding in which characters with
    19231927CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
    1924 little-endian word. The encoded data is implicitly big-endian;
     1928big-endian word. The encoded data is implicitly big-endian;
    19251929byte-order-mark characters are not interpreted on input or prepended
    19261930to output."
    19271931  #+little-endian-target
    1928   "A 16-bit, variable-length encoding in which characters with
     1932  "A 16-bit, fixed-length encoding in which characters with
    19291933CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
    19301934little-endian word. The encoded data is implicitly little-endian;
     
    20162020(define-character-encoding #+big-endian-target :ucs-2le #-big-endian-target :ucs-2be
    20172021  #+little-endian-target
    2018   "A 16-bit, variable-length encoding in which characters with
     2022  "A 16-bit, fixed-length encoding in which characters with
    20192023CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
    2020 little-endian word. The encoded data is implicitly big-endian;
     2024big-endian word. The encoded data is implicitly big-endian;
    20212025byte-order-mark characters are not interpreted on input or prepended
    20222026to output."
    20232027  #+big-endian-target
    2024   "A 16-bit, variable-length encoding in which characters with
     2028  "A 16-bit, fixed-length encoding in which characters with
    20252029CHAR-CODEs less than #x10000 can be encoded in a single 16-bit
     2030
    20262031little-endian word. The encoded data is implicitly little-endian;
    20272032byte-order-mark characters are not interpreted on input or prepended
     
    20882093       (let* ((1st-unit (%swap-u16 (%get-unsigned-word pointer index))))
    20892094         (declare (type (unsigned-byte 16) 1st-unit))
    2090          (setf (schar string i) (or (char-code 1st-unit) #\Replacement_Character))))))
     2095         (setf (schar string i) (or (code-char 1st-unit) #\Replacement_Character))))))
    20912096  :octets-in-string-function
    20922097  #'ucs-2-octets-in-string
     
    21112116(define-character-encoding :ucs-2
    21122117    "A 16-bit, fixed-length encoding in which characters with
    2113 CHAR-CODEs less than #x10000 can be encoded in a single 16-bit words.
     2118CHAR-CODEs less than #x10000 can be encoded in a single 16-bit word.
    21142119The endianness of the encoded data is indicated by the endianness of a
    21152120byte-order-mark character (#\u+feff) prepended to the data; in the
     
    22322237          (decf noctets 2))
    22332238         (#.swapped-byte-order-mark-char-code
    2234           (incf start)
    2235           (decf noctets))))
     2239          (incf start 2)
     2240          (decf noctets 2))))
    22362241     (values (floor noctets 2) (+ start noctets))))
    22372242  :literal-char-code-limit #x10000
     
    22392244  #+big-endian-target :ucs-2le
    22402245  #+little-endian-target :ucs-2be
     2246  )
     2247
     2248
     2249(defun ucs-4-stream-encode (char write-function stream)
     2250  (let* ((code (char-code char)))
     2251    (declare (type (mod #x110000) code))
     2252    (funcall write-function stream code)
     2253    1))
     2254
     2255(defun ucs-4-stream-decode (1st-unit next-unit-function stream)
     2256  (declare (type (unsigned-byte 16) 1st-unit)
     2257           (ignore next-unit-function stream))
     2258  (code-char 1st-unit))
     2259
     2260
     2261(defun ucs-4-octets-in-string (string start end)
     2262  (declare (ignore string))
     2263  (if (>= end start)
     2264    (* 4 (- end start))
     2265    0))
     2266
     2267
     2268(declaim (inline %big-endian-u8-ref-u32 %little-endian-u8-ref-u32))
     2269(defun %big-endian-u8-ref-u32 (u8-vector idx)
     2270  (declare (type (simple-array (unsigned-byte 8) (*)) u8-vector)
     2271           (fixnum idx))
     2272  (logior (the (unsigned-byte 32) (ash (the (unsigned-byte 8) (aref u8-vector idx)) 24))
     2273          (the (unsigned-byte 24)
     2274            (logior
     2275             (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (1+ idx)))) 16)
     2276             (the (unsigned-byte 16)
     2277               (logior
     2278                (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 2)))) 8)
     2279                (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 3))))))))))
     2280
     2281(defun %little-endian-u8-ref-u32 (u8-vector idx)
     2282  (declare (type (simple-array (unsigned-byte 8) (*)) u8-vector)
     2283           (fixnum idx))
     2284  (logior (the (unsigned-byte 32) (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 3)))) 24))
     2285          (the (unsigned-byte 24)
     2286            (logior
     2287             (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (+ idx 2)))) 16)
     2288             (the (unsigned-byte 16)
     2289               (logior
     2290                (ash (the (unsigned-byte 8) (aref u8-vector (the fixnum (1+ idx)))) 8)
     2291                (the (unsigned-byte 8) (aref u8-vector (the fixnum idx)))))))))
     2292
     2293#+big-endian-target
     2294(progn
     2295(defmacro %native-u8-ref-u32 (vector idx)
     2296  `(%big-endian-u8-ref-u32 ,vector ,idx))
     2297
     2298(defmacro %reversed-u8-ref-u32 (vector idx)
     2299  `(%little-endian-u8-ref-u32 ,vector ,idx))
     2300)
     2301
     2302#+little-endian-target
     2303(progn
     2304(defmacro %native-u8-ref-u32 (vector idx)
     2305  `(%little-endian-u8-ref-u32 ,vector ,idx))
     2306
     2307(defmacro %reversed-u8-ref-u32 (vector idx)
     2308  `(%big-endian-u8-ref-u32 ,vector ,idx))
     2309)
     2310
     2311
     2312(declaim (inline (setf %big-endian-u8-ref-32) (setf %little-endian-u8-ref-u32)))
     2313(defun (setf %big-endian-u8-ref-u32) (val u8-vector idx)
     2314  (declare (type (unsigned-byte 32) val)
     2315           (type (simple-array (unsigned-byte 8) (*)) u8-vector)
     2316           (fixnum idx))
     2317  (setf (aref u8-vector idx) (ldb (byte 8 24) val)
     2318        (aref u8-vector (the fixnum (1+ idx))) (ldb (byte 8 16) val)
     2319        (aref u8-vector (the fixnum (+ idx 2))) (ldb (byte 8 8) val)
     2320        (aref u8-vector (the fixnum (+ idx 3))) (ldb (byte 8 0) val))
     2321  val)
     2322
     2323(defun (setf %little-endian-u8-ref-u32) (val u8-vector idx)
     2324  (declare (type (unsigned-byte 16) val)
     2325           (type (simple-array (unsigned-byte 8) (*)) u8-vector)
     2326           (fixnum idx))
     2327  (setf (aref u8-vector idx) (ldb (byte 8 0) val)
     2328        (aref u8-vector (the fixnum (1+ idx))) (ldb (byte 8 8) val)
     2329        (aref u8-vector (the fixnum (+ idx 2))) (ldb (byte 8 16) val)
     2330        (aref u8-vector (the fixnum (+ idx 3))) (ldb (byte 8 24) val))
     2331  val)
     2332
     2333
     2334;;; UTF-32/UCS-4, native byte order
     2335(define-character-encoding #+big-endian-target :utf32-be #-big-endian-target :utf32-le
     2336  #+big-endian-target
     2337  "A 32-bit, fixed-length encoding in which all Unicode characters
     2338encoded in a single 32-bit word. The encoded data is implicitly big-endian;
     2339byte-order-mark characters are not interpreted on input or prepended
     2340to output."
     2341  #+little-endian-target
     2342  "A 32-bit, fixed-length encoding in which all Unicode characters can
     2343encoded in a single 32-bit word. The encoded data is implicitly
     2344little-endian; byte-order-mark characters are not interpreted on input
     2345or prepended to output."
     2346  :aliases #+big-endian-target '(:ucs-4be) #+little-endian-target '(:ucs-4le)
     2347  :max-units-per-char 1
     2348  :code-unit-size 32
     2349  :native-endianness t
     2350  :stream-encode-function
     2351  #'ucs-4-stream-encode
     2352  :Stream-decode-function
     2353  #'ucs-4-stream-decode
     2354  :vector-encode-function
     2355  (nfunction
     2356   native-ucs-4-vector-encode
     2357   (lambda (string vector idx start end)
     2358     (declare (type (simple-array (unsigned-byte 8) (*)) vector)
     2359              (fixnum idx))
     2360     (do* ((i start (1+ i)))
     2361          ((>= i end) idx)
     2362       (let* ((char (schar string i))
     2363              (code (char-code char)))
     2364         (declare (type (mod #x110000) code))
     2365         (setf (%native-u8-ref-u32 vector idx) code)
     2366         (incf idx 4)))))
     2367  :vector-decode-function
     2368  (nfunction
     2369   native-ucs-4-vector-decode
     2370   (lambda (vector idx noctets string)
     2371     (declare (type (simple-array (unsigned-byte 8) (*)) vector)
     2372              (type index idx))
     2373     (do* ((i 0 (1+ i))
     2374           (end (+ idx noctets))
     2375           (index idx (+ 4 index)))
     2376          ((>= index end) index)
     2377       (declare (fixnum i len index))
     2378       (let* ((code (%native-u8-ref-u32 vector index)))
     2379         (declare (type (unsigned-byte 32) code))
     2380         (setf (schar string i)
     2381               (or (if (< code char-code-limit)
     2382                      (code-char code))
     2383                   #\Replacement_Character))))))
     2384  :memory-encode-function
     2385  (nfunction
     2386   native-ucs-4-memory-encode
     2387   (lambda (string pointer idx start end)
     2388     (declare (fixnum idx))
     2389     (do* ((i start (1+ i)))
     2390          ((>= i end) idx)
     2391       (let* ((code (char-code (schar string i))))
     2392         (declare (type (mod #x110000) code))
     2393         (setf (%get-unsigned-long pointer idx) code)
     2394         (incf idx 4)))))
     2395  :memory-decode-function
     2396  (nfunction
     2397   native-ucs-4-memory-decode
     2398   (lambda (pointer noctets idx string)
     2399     (declare (fixnum noctets idx))
     2400     (do* ((i 0 (1+ i))
     2401           (index idx (+ index 4)))
     2402          ((>= i noctets) index)
     2403       (declare (fixnum i index))
     2404       (let* ((1st-unit (%get-unsigned-long pointer index)))
     2405         (declare (type (unsigned-byte 32) 1st-unit))
     2406         (setf (schar string i) (or (if (< 1st-unit char-code-limit)
     2407                                      (code-char 1st-unit))
     2408                                    #\Replacement_Character))))))
     2409  :octets-in-string-function
     2410  #'ucs-4-octets-in-string
     2411  :length-of-vector-encoding-function
     2412  (nfunction
     2413   native-ucs-4-length-of-vector-encoding
     2414   (lambda (vector start end)
     2415     (declare (ignore vector))
     2416     (do* ((i start (1+ i))
     2417           (j (+ i 4) (+ i 4))
     2418           (nchars 0 (1+ nchars)))
     2419          ((> j end) (values nchars i)))))
     2420  :length-of-memory-encoding-function
     2421  (nfunction
     2422   native-ucs-4-length-of-memory-encoding
     2423   (lambda (pointer noctets start)
     2424     (declare (ignore pointer))
     2425     (values (floor noctets 4) (+ start noctets))))
     2426  :literal-char-code-limit #x110000
     2427  )
     2428
     2429;;; UTF-32/UCS-4, reversed byte order
     2430(define-character-encoding #+big-endian-target :utf32-le #-big-endian-target :utf32-be
     2431  #+little-endian-target
     2432  "A 32-bit, fixed-length encoding in which all Unicode characters
     2433encoded in a single 32-bit word. The encoded data is implicitly big-endian;
     2434byte-order-mark characters are not interpreted on input or prepended
     2435to output."
     2436  #+big-endian-target
     2437  "A 32-bit, fixed-length encoding in which all Unicode characters can
     2438encoded in a single 32-bit word. The encoded data is implicitly
     2439little-endian; byte-order-mark characters are not interpreted on input
     2440or prepended to output."
     2441  :aliases #+big-endian-target '(:ucs-4le) #+little-endian-target '(:ucs-4be)
     2442  :max-units-per-char 1
     2443  :code-unit-size 32
     2444  :native-endianness nil
     2445  :stream-encode-function
     2446  #'ucs-4-stream-encode
     2447  :Stream-decode-function
     2448  #'ucs-4-stream-decode
     2449  :vector-encode-function
     2450  (nfunction
     2451   native-ucs-4-vector-encode
     2452   (lambda (string vector idx start end)
     2453     (declare (type (simple-array (unsigned-byte 8) (*)) vector)
     2454              (fixnum idx))
     2455     (do* ((i start (1+ i)))
     2456          ((>= i end) idx)
     2457       (let* ((char (schar string i))
     2458              (code (char-code char)))
     2459         (declare (type (mod #x110000) code))
     2460         (setf (%reversed-u8-ref-u32 vector idx) code)
     2461         (incf idx 4)))))
     2462  :vector-decode-function
     2463  (nfunction
     2464   native-ucs-4-vector-decode
     2465   (lambda (vector idx noctets string)
     2466     (declare (type (simple-array (unsigned-byte 8) (*)) vector)
     2467              (type index idx))
     2468     (do* ((i 0 (1+ i))
     2469           (end (+ idx noctets))
     2470           (index idx (+ 4 index)))
     2471          ((>= index end) index)
     2472       (declare (fixnum i len index))
     2473       (let* ((code (%reversed-u8-ref-u32 vector index)))
     2474         (declare (type (unsigned-byte 32) code))
     2475         (setf (schar string i)
     2476               (or (if (< code char-code-limit)
     2477                     (code-char code))
     2478                   #\Replacement_Character))))))
     2479  :memory-encode-function
     2480  (nfunction
     2481   native-ucs-4-memory-encode
     2482   (lambda (string pointer idx start end)
     2483     (declare (fixnum idx))
     2484     (do* ((i start (1+ i)))
     2485          ((>= i end) idx)
     2486       (let* ((code (char-code (schar string i))))
     2487         (declare (type (mod #x110000) code))
     2488         (setf (%get-unsigned-long pointer idx) (%swap-u32 code))
     2489         (incf idx 4)))))
     2490  :memory-decode-function
     2491  (nfunction
     2492   reversed-ucs-4-memory-decode
     2493   (lambda (pointer noctets idx string)
     2494     (declare (fixnum noctets idx))
     2495     (do* ((i 0 (1+ i))
     2496           (index idx (+ index 4)))
     2497          ((>= i noctets) index)
     2498       (declare (fixnum i index))
     2499       (let* ((1st-unit (%swap-u32 (%get-unsigned-long pointer index))))
     2500         (declare (type (unsigned-byte 32) 1st-unit))
     2501         (setf (schar string i) (or (if (< 1st-unit char-code-limit)
     2502                                      (code-char 1st-unit))
     2503                                    #\Replacement_Character))))))
     2504
     2505  :octets-in-string-function
     2506  #'ucs-4-octets-in-string
     2507  :length-of-vector-encoding-function
     2508  (nfunction
     2509   reversed-ucs-4-length-of-vector-encoding
     2510   (lambda (vector start end)
     2511     (declare (ignore vector))
     2512     (do* ((i start (1+ i))
     2513           (j (+ i 4) (+ i 4))
     2514           (nchars 0 (1+ nchars)))
     2515          ((> j end) (values nchars i)))))
     2516  :length-of-memory-encoding-function
     2517  (nfunction
     2518   reversed-ucs-4-length-of-memory-encoding
     2519   (lambda (pointer noctets start)
     2520     (declare (ignore pointer))
     2521     (values (floor noctets 4) (+ start noctets))))
     2522  :literal-char-code-limit #x110000
     2523  )
     2524
     2525(define-character-encoding :utf-32
     2526    "A 32-bit, fixed-length encoding in which all Unicode characters can be encoded in a single 32-bit word.  The endianness of the encoded data is indicated by the endianness of a byte-order-mark character (#\u+feff) prepended to the data; in the absence of such a character on input, input data is assumed to be in big-endian order.  Output is written in native byte order with a leading byte-order mark."
     2527   
     2528  :aliases '(:utf-4)
     2529  :max-units-per-char 1
     2530  :code-unit-size 32
     2531  :native-endianness t                  ;not necessarily true.
     2532  :stream-encode-function
     2533  #+ucs-4-stream-encode
     2534  :stream-decode-function
     2535  #'ucs-4-stream-decode
     2536  :vector-encode-function
     2537  (nfunction
     2538   utf-32-vector-encode
     2539   (lambda (string vector idx start end)
     2540     (declare (type (simple-array (unsigned-byte 8) (*)) vector)
     2541              (fixnum idx))
     2542     (when (> end start)
     2543       (setf (%native-u8-ref-u32 vector idx) byte-order-mark-char-code)
     2544       (incf idx 4))
     2545     (do* ((i start (1+ i)))
     2546          ((>= i end) idx)
     2547       (let* ((char (schar string i))
     2548              (code (char-code char)))
     2549         (declare (type (mod #x110000) code))
     2550         (setf (%native-u8-ref-u32 vector idx) code)
     2551         (incf idx 4)))))
     2552  :vector-decode-function
     2553  (nfunction
     2554   utf-32-vector-decode
     2555   (lambda (vector idx noctets string)
     2556     (declare (type (simple-array (unsigned-byte 8) (*)) vector)
     2557              (type index idx)
     2558              (fixnum noctets))
     2559     (let* ((swap (if (> noctets 3)
     2560                    (case (%native-u8-ref-u32 vector idx)
     2561                      (#.byte-order-mark-char-code
     2562                       (incf idx 4) (decf noctets 4) nil)
     2563                      (#.swapped-byte-order-mark-char-code
     2564                       (incf idx 4) (decf noctets 4) t)
     2565                       (t #+little-endian-target t)))))
     2566
     2567       (do* ((i 0 (1+ i))
     2568             (end (+ idx noctets))
     2569             (index idx (1+ index)))
     2570            ((>= index end) index)
     2571         (declare (fixnum i len index))
     2572         (let* ((1st-unit (if swap
     2573                            (%reversed-u8-ref-u32 vector index)
     2574                            (%native-u8-ref-u32 vector index))))
     2575             (declare (type (unsigned-byte 32) 1st-unit))
     2576             (setf (schar string i) (or (if (< 1st-unit char-code-limit)
     2577                                          (code-char 1st-unit))
     2578                                        #\Replacement_Character)))))))
     2579  :memory-encode-function
     2580  (nfunction
     2581   utf-32-memory-encode
     2582   (lambda (string pointer idx start end)
     2583     (declare (fixnum idx))
     2584     (when (> end start)
     2585       (setf (%get-unsigned-long pointer idx)
     2586             byte-order-mark-char-code)
     2587       (incf idx 4))
     2588     (do* ((i start (1+ i)))
     2589          ((>= i end) idx)
     2590       (let* ((code (char-code (schar string i))))
     2591         (declare (type (mod #x110000) code))
     2592         (setf (%get-unsigned-long pointer idx) code)
     2593         (incf idx 4)))))
     2594  :memory-decode-function
     2595  (nfunction
     2596   utf-32-memory-decode
     2597   (lambda (pointer noctets idx string)
     2598     (declare (fixnum noctets idx))
     2599     (let* ((swap (when (> noctets 3)
     2600                    (case (%get-unsigned-long pointer idx)
     2601                      (#.byte-order-mark-char-code
     2602                       (incf idx 4)
     2603                       (decf noctets 4)
     2604                       nil)
     2605                      (#.swapped-byte-order-mark-char-code
     2606                       (incf idx 4)
     2607                       (decf noctets 4)
     2608                       t)
     2609                      (t #+little-endian-target t)))))
     2610       (do* ((i 0 (1+ i))
     2611           (index idx (+ index 2)))
     2612          ((>= i noctets) index)
     2613       (declare (fixnum i index))
     2614       (let* ((1st-unit (%get-unsigned-long pointer index)))
     2615         (declare (type (unsigned-byte 32) 1st-unit))
     2616         (if swap (setq 1st-unit (%swap-u32 1st-unit)))
     2617         (setf (schar string i) (or (if (< 1st-unit char-code-limit)
     2618                                      (code-char 1st-unit))
     2619                                    #\Replacement_Character)))))))
     2620  :octets-in-string-function
     2621  #'(lambda (&rest args)
     2622      (declare (dynamic-extent args))
     2623      ;; Add four for the BOM.
     2624      (+ 4 (apply #'ucs-4-octets-in-string args)))
     2625  :length-of-vector-encoding-function
     2626  (nfunction
     2627   utf-32-length-of-vector-encoding
     2628   (lambda (vector start end)
     2629     (declare (ignore vector))
     2630     (do* ((i start (1+ i))
     2631           (j (+ i 2) (+ i 2))
     2632           (nchars 0 (1+ nchars)))
     2633          ((> j end) (values nchars i)))))
     2634  :length-of-memory-encoding-function
     2635  (nfunction
     2636   utf-32-length-of-memory-encoding
     2637   (lambda (pointer noctets start)
     2638     (when (> noctets 1)
     2639       (case (%get-unsigned-long pointer )
     2640         (#.byte-order-mark-char-code
     2641          (incf start 4)
     2642          (decf noctets 4))
     2643         (#.swapped-byte-order-mark-char-code
     2644          (incf start 4)
     2645          (decf noctets 4))))
     2646     (values (floor noctets 4) (+ start noctets))))
     2647  :literal-char-code-limit #x110000
     2648  :use-byte-order-mark
     2649  #+big-endian-target :utf-32le
     2650  #+little-endian-target :utf-32be
    22412651  )
    22422652
Note: See TracChangeset for help on using the changeset viewer.