Changeset 15536 for trunk

Dec 13, 2012, 6:27:55 AM (8 years ago)

Support using the "coding" option in a file's file options line (a
line at the start of a text file that contains name:value pairs
separated by semicolons bracketed by -*- sequences) to determine a
file's character encoding. Specifically:

  • OPEN now allows an external-format of :INFERRED; previously, this was shorthand for an external-format whose line-termination was inferred and whose character encoding was based on *DEFAULT-FILE-CHARACTER-ENCODING*. When an input file whose external-format is specified as :INFERRED is opened, its file options are parsed and the value of the "coding" option is used if such an option is found (and if the value is something that CCL supports.) If a supported "coding" option isn't found, *DEFAULT-FILE-CHARACTER-ENCODING* is used as before.
  • In the Cocoa IDE, the Hemlock command "Ensure File Options Line" (bound to Control-Meta-M by default) ensures that the first line in the current buffer is a file options line and fills in some plausible values for the "Mode", "Package", and "Coding" options. The "Process File Options" command (Control-Meta-m) can be used to process the file options line after it's been edited. (The file options line is always processed when the file is first opened; changes to the "coding" option affect how the file will be saved.)

When a Lisp source file is opened in the IDE editor, the following
character encodings are tried in this order until one of them

  • if the "Open ..." panel was used to open the file and an encoding other than "Automatic" - which is now the default - is selected, that encoding is tried.
  • if a "coding" option is found, that encoding is tried.
  • the value of *DEFAULT-FILE-CHARACTER-ENCODING* is tried.
  • iso-8859-1 is tried. All files can be decoded in iso-8859-1.

This is all supposed to be what Emacs does and I think that it's
pretty close in practice.

A file that caused problems for Paul Krueger a few days ago
because its encoding (ISO-8859-1) wasn't guessed correctly
now has an explicit "coding" option and serves as a test case.

9 edited


  • trunk/source/cocoa-ide/builder-utilities.lisp

    r15350 r15536  
    1 ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: cl-user -*-
     1;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: cl-user; coding:iso-8859-1; -*-
    22;;;; ***********************************************************************
  • trunk/source/cocoa-ide/cocoa-editor.lisp

    r15502 r15536  
    18511851    (when doc
    18521852      (document-invalidate-modeline doc))))
     1854;;; Process a file's "coding" file-option.
     1855(defun hemlock-ext:set-buffer-external-format (buffer string)
     1856  (let* ((ef (ccl::process-file-coding-option string (or (hi::buffer-line-termination buffer) :unix)))
     1857         (encoding-val (nsstring-encoding-for-external-format ef)))
     1858    (cond (encoding-val
     1859           (setf (hi::buffer-line-termination buffer)
     1860                 (external-format-line-termination ef))
     1861           (let* ((doc (hi::buffer-document buffer)))
     1862             (when doc
     1863               (with-slots (encoding) doc
     1864                 (setq encoding encoding-val))))
     1865           (hemlock-ext:invalidate-modeline buffer))
     1866          (t
     1867           (hi:loud-message "Can't parse coding option ~a." string)))))
    18541870(def-cocoa-default *text-pane-margin-width* :float 0.0f0 "width of indented margin around text pane")
    23372353                               :crlf
    23382354                               :cr)
    2339                              :lf))
     2355                             :unix))
    23402356         (hemlock-string (case line-termination
    23412357                           (:crlf (remove #\return string))
    26662682      (hemlock-view frame))))
     2685(defun nsstring-encoding-for-character-encoding-name (name)
     2686  (let* ((string (string name))
     2687         (len (length string)))
     2688    (with-cstrs ((cstr string))
     2689      (with-nsstr (nsstr cstr len)
     2690        (let* ((cf (#_CFStringConvertIANACharSetNameToEncoding nsstr)))
     2691          (if (= cf #$kCFStringEncodingInvalidId)
     2692            (setq cf (#_CFStringGetSystemEncoding)))
     2693          (let* ((ns (#_CFStringConvertEncodingToNSStringEncoding cf)))
     2694            (if (= ns #$kCFStringEncodingInvalidId)
     2695              (#/defaultCStringEncoding ns:ns-string)
     2696              ns)))))))
     2698(defun nsstring-encoding-for-external-format (ef)
     2699  (and ef (nsstring-encoding-for-character-encoding-name
     2700           (ccl:external-format-character-encoding ef))))
    26682702;;; Map *default-file-character-encoding* to an :<NSS>tring<E>ncoding
    26692703(defun get-default-encoding ()
    26722706    (when (and (typep file-encoding 'keyword)
    26732707               (lookup-character-encoding file-encoding))
    2674       (let* ((string (string file-encoding))
    2675              (len (length string)))
    2676         (with-cstrs ((cstr string))
    2677           (with-nsstr (nsstr cstr len)
    2678             (let* ((cf (#_CFStringConvertIANACharSetNameToEncoding nsstr)))
    2679               (if (= cf #$kCFStringEncodingInvalidId)
    2680                 (setq cf (#_CFStringGetSystemEncoding)))
    2681               (let* ((ns (#_CFStringConvertEncodingToNSStringEncoding cf)))
    2682                 (if (= ns #$kCFStringEncodingInvalidId)
    2683                   (#/defaultCStringEncoding ns:ns-string)
    2684                   ns)))))))))
     2708      (nsstring-encoding-for-character-encoding-name file-encoding))))
    26862710(defclass hemlock-document-controller (ns:ns-document-controller)
    28962920    buffer))
     2922;;; Try to read the URL's contents into an NSString which can be
     2923;;; used to initialize the document's Hemlock buffer and related
     2924;;; data structures.  First, try to use the encoding specified
     2925;;; in the last call to the document controller's "open" panel;
     2926;;; if that wasn't specified (was 0, "automatic") or if the string
     2927;;; couldn't be initialized in that encoding, try to use the
     2928;;; encoding specified in the "coding:" file option if that's present.
     2929;;; If that wasn't specified or fails, fall back to the default
     2930;;; encoding (based on CCL:*DEFAULT-FILE-CHARACTER-ENCODING*), and
     2931;;; if that fails, try using :iso-8859-1 (which should always win
     2932;;; but which may misinterpret some characters.)
     2933;;; We should only lose because of a filesystem or permissions
     2934;;; problem or because of a severe low-memory condition or something
     2935;;; equally catastrophic.
     2936;;; We should be careful to zero out the encoding from the last call
     2937;;; to the "open" panel so that leftover value doesn't affect anything
     2938;;; but the next call to this method, and if an encoding selected
     2939;;; explicitly (via the "open" panel or the file-options line) didn't
     2940;;; work, it'd be nice to (somehow) let the user know that.
     2941;;; Whatever encoding works here is remembered as the document's
     2942;;; encoding; that may be overridden when the file-options are parsed.
    28982943(objc:defmethod (#/readFromURL:ofType:error: :<BOOL>)
    28992944    ((self hemlock-editor-document) url type (perror (:* :id)))
    29002945  (declare (ignorable type))
    29012946  (with-callback-context "readFromURL"
    2902     (rlet ((pused-encoding :<NSS>tring<E>ncoding 0))
    2903       (let* ((pathname
    2904               (lisp-string-from-nsstring
    2905                (if (#/isFileURL url)
    2906                  (#/path url)
    2907                  (#/absoluteString url))))
    2908              (buffer (or (hemlock-buffer self)
    2909                          (make-buffer-for-document self pathname)))
    2910              (selected-encoding (slot-value (#/sharedDocumentController (find-class 'hemlock-document-controller)) 'last-encoding))
    2911              (string
    2912               (if (zerop selected-encoding)
    2913                 (#/stringWithContentsOfURL:usedEncoding:error:
    2914                  ns:ns-string
    2915                  url
    2916                  pused-encoding
    2917                  perror)
    2918                 +null-ptr+)))
    2920         (if (%null-ptr-p string)
    2921           (progn
    2922             (if (zerop selected-encoding)
    2923               (setq selected-encoding (or (get-default-encoding) #$NSISOLatin1StringEncoding)))
    2924             (setq string (#/stringWithContentsOfURL:encoding:error:
    2925                           ns:ns-string
    2926                           url
    2927                           selected-encoding
    2928                           perror)))
    2929           (setq selected-encoding (pref pused-encoding :<NSS>tring<E>ncoding)))
     2947    (let* ((data (#/dataWithContentsOfURL:options:error:
     2948                  ns:ns-data url 0 perror))
     2949           (bytes (#/bytes data))
     2950           (length (#/length data))
     2951           (pathname
     2952            (lisp-string-from-nsstring
     2953             (if (#/isFileURL url)
     2954                   (#/path url)
     2955               (#/absoluteString url))))
     2956           (buffer (or (hemlock-buffer self)
     2957                       (make-buffer-for-document self pathname)))
     2958           (document-controller (#/sharedDocumentController (find-class 'hemlock-document-controller)))
     2959           (string +null-ptr+))
     2960      (flet ((try-encoding (encoding)
     2961               (setq string
     2962                     (if (or (null encoding)
     2963                             (zerop encoding))
     2964                       +null-ptr+
     2965                       (make-instance ns:ns-string
     2966                                      :with-bytes-no-copy bytes
     2967                                      :length length
     2968                                      :encoding encoding
     2969                                      :free-when-done nil)))
     2970               (unless (%null-ptr-p string)
     2971                 (setf (slot-value self 'encoding) encoding)
     2972                 t)))
     2973        (unless (try-encoding (with-slots (last-encoding) document-controller
     2974                                (prog1 last-encoding
     2975                                  (setq last-encoding 0))))
     2976          (unless (try-encoding (nsstring-encoding-for-external-format (ccl::external-format-from-octet-buffer bytes length)))
     2977            (unless (try-encoding (get-default-encoding))
     2978              (try-encoding #$NSISOLatin1StringEncoding))))
    29302979        (unless (%null-ptr-p string)
    2931           (with-slots (encoding) self (setq encoding selected-encoding))
    29332980          ;; ** TODO: Argh.  How about we just let hemlock insert it.
    29342981          (let* ((textstorage (slot-value self 'textstorage))
    29482995                 (hi::note-modeline-change buffer)
    29492996                 (setf (hi::buffer-modified buffer) nil))))
    2950           t)))))
     2997            t)))))
    30143061(objc:defmethod (#/prepareSavePanel: :<BOOL>) ((self hemlock-editor-document)
    30153062                                               panel)
    3016   (with-slots (encoding) self
    3017     (let* ((popup (build-encodings-popup (#/sharedDocumentController hemlock-document-controller) encoding)))
     3063  (let* ((popup (build-encodings-popup (#/sharedDocumentController hemlock-document-controller))))
    30183064      (#/setAction: popup (@selector #/noteEncodingChange:))
    30193065      (#/setTarget: popup self)
    3020       (#/setAccessoryView: panel popup)))
     3066      (#/setAccessoryView: panel popup))
    30213067  (#/setExtensionHidden: panel nil)
    30223068  (#/setCanSelectHiddenExtension: panel nil)
    33733419;;; user isn't interested in.)
    33743420(defmethod build-encodings-popup ((self hemlock-document-controller)
    3375                                   &optional (preferred-encoding (get-default-encoding)))
     3421                                  &optional preferred-encoding)
     3422  (declare (ignorable preferred-encoding))
    33763423  (let* ((id-list (supported-nsstring-encodings))
    33773424         (popup (make-instance 'ns:ns-pop-up-button)))
    33823429      (#/addItemWithTitle: popup (nsstring-for-nsstring-encoding id))
    33833430      (#/setTag: (#/lastItem popup) (nsstring-encoding-to-nsinteger id)))
    3384     (when preferred-encoding
    3385       (#/selectItemWithTag: popup (nsstring-encoding-to-nsinteger preferred-encoding)))
     3431    (#/selectItemWithTag: popup (if preferred-encoding (nsstring-encoding-to-nsinteger preferred-encoding) 0))
    33863432    (#/sizeToFit popup)
    33873433    popup))
  • trunk/source/cocoa-ide/hemlock/src/filecoms.lisp

    r14734 r15536  
    6161          (cond
    6262           ((find #\: string :start start :end end)
    63             (do ((opt-start start (1+ semi)) colon semi)
     63            (do ((opt-start start (1+ semi)) colon semi real-semi)
    6464                (nil)
    6565              (setq colon (position #\: string :start opt-start :end end))
    6666              (unless colon
    67                 (loud-message "Missing \":\".  Aborting file options.")
     67                (unless real-semi
     68                  (loud-message "Missing \":\".  Aborting file options."))
    6869                (return-from do-file-options))
    69               (setq semi (or (position #\; string :start colon :end end) end))
     70              (setq semi (or (setq real-semi (position #\; string :start colon :end end)) end))
    7071              (let* ((option (nstring-downcase
    7172                              (trim-subseq string opt-start colon)))
    130131(define-file-option "log" (buffer string)
    131132  (declare (ignore buffer string)))
     134(define-file-option "base" (buffer string)
     135  (declare (ignore buffer string)))
     137(define-file-option "syntax" (buffer string)
     138  (declare (ignore buffer string)))
     140(define-file-option "coding" (buffer string)
     141  (hemlock-ext:set-buffer-external-format buffer string))
    245256                      (variable-value 'hemlock::current-package
    246257                                      :buffer buffer)
    247                       "CL-USER")))
     258                      "CL-USER"))
     259                   (encoding-string (let* ((string (hemlock-ext:buffer-encoding-name buffer))
     260                                           (suffix (case (hi::buffer-line-termination buffer)
     261                                                     (:cr "mac")
     262                                                     (:crlf "dos"))))
     263                                      (if suffix
     264                                        (concatenate 'string string "-" suffix)
     265                                        string))))
    248266              (insert-string
    249267               mark
    250                (format nil ";;; -*- Mode: Lisp; Package: ~a -*-" package-name)))
     268               (format nil ";;; -*- Mode: Lisp; Package: ~a; Coding: ~a; -*-" package-name encoding-string)))
    251269            (insert-string
    252270             mark
  • trunk/source/cocoa-ide/hemlock/src/package.lisp

    r15445 r15536  
    378378   #:top-listener-input-stream
    379379   #:invalidate-modeline
     380   #:set-buffer-external-format
    380381   #:note-buffer-saved
    381382   #:note-buffer-unsaved
  • trunk/source/level-1/l1-files.lisp

    r15379 r15536  
    13321332             (with-open-file (stream file-name
    13331333                                     :element-type 'base-char
    1334                                      :external-format external-format)
     1334                                     :external-format (if (eq external-format :default) :inferred external-format))
    13351335               (load-from-stream stream print)))))))
    13361336  file-name)
  • trunk/source/level-1/l1-streams.lisp

    r15292 r15536  
    27522752    (setf (ioblock-unread-char-function ioblock) (select-stream-untyi-function (ioblock-stream ioblock) :input))
    27532753    (setf (ioblock-decode-literal-code-unit-limit ioblock)
    2754           (if encoding
     2754          (if (and encoding (not (eq encoding :inferred)))
    27552755            (character-encoding-decode-literal-code-unit-limit encoding)
    27562756            256))   
    2757     (if encoding
     2757    (if (and encoding (not (eq encoding :inferred)))
    27582758      (let* ((unit-size (character-encoding-code-unit-size encoding)))
    27592759        (setf (ioblock-peek-char-function ioblock) '%encoded-ioblock-peek-char)
    31623162  (setq line-termination (cdr (assoc line-termination *canonical-line-termination-conventions*)))
    31633163  (when encoding
    3164     (unless (typep encoding 'character-encoding)
    3165       (setq encoding (get-character-encoding encoding)))
    3166     (if (eq encoding (get-character-encoding nil))
    3167       (setq encoding nil)))
     3164    (cond ((and (eq encoding :inferred)
     3165                (typep stream 'file-stream)
     3166                insize))
     3167          (t
     3168           (unless (typep encoding 'character-encoding)
     3169             (setq encoding (get-character-encoding encoding)))
     3170           (if (eq encoding (get-character-encoding nil))
     3171             (setq encoding nil)))))
    31683172  (when sharing
    31693173    (unless (or (eq sharing :private)
    32613265      (setf (ioblock-interactive ioblock) interactive))
    32623266    (setf (stream-ioblock stream) ioblock)
    3263     (when encoding
     3267    (when (and encoding (not (eq encoding :inferred)))
    32643268      (setf (ioblock-native-byte-order ioblock)
    32653269            (character-encoding-native-endianness encoding)))
    3266     (let* ((bom-info (and insize encoding (character-encoding-use-byte-order-mark encoding))))
     3270    (let* ((bom-info (and insize encoding (not (eq encoding :inferred)) (character-encoding-use-byte-order-mark encoding))))
    32673271      (when bom-info
    32683272        (ioblock-check-input-bom ioblock bom-info sharing)))
  • trunk/source/level-1/l1-sysio.lisp

    r15270 r15536  
    3939  (setf (file-ioblock-octet-pos file-ioblock)
    4040        (file-octet-filepos file-ioblock)))
     42(defun infer-character-encoding (file-ioblock)
     43  (let* ((inbuf (file-ioblock-inbuf file-ioblock))
     44         (bufptr (io-buffer-bufptr inbuf))
     45         (n (io-buffer-count inbuf)))
     46    (when (zerop n)
     47      (setq n (or (fd-stream-advance (file-ioblock-stream file-ioblock)
     48                                     file-ioblock
     49                                     t)
     50                  0)))
     51    (setf (stream-external-format (file-ioblock-stream file-ioblock))
     52          (or (external-format-from-octet-buffer bufptr n)
     53              (normalize-external-format :file (file-ioblock-line-termination file-ioblock))))))
    4256(defun infer-line-termination (file-ioblock)
    124138    (setq character-encoding
    125139          (default-character-encoding domain)))
    126   (unless (lookup-character-encoding character-encoding)
     140  (unless (or (lookup-character-encoding character-encoding)
     141              (and (eq domain :file) (eq character-encoding :inferred)))
    127142    (error "~S is not the name of a known character encoding."
    128143           character-encoding))
    143158        ((typep external-format 'external-format)
    144159         external-format)
     160        ((eq external-format :inferred)
     161         (make-external-format :domain domain :character-encoding :inferred :line-termination :inferred))
    145162        ((eq external-format :default)
    146163         (normalize-external-format domain *default-external-format*))
    872889                  (setf (file-ioblock-fileeof ioblock)
    873890                        (ioblock-octets-to-elements ioblock (fd-size fd)))
     891                  (when (and in-p (eq encoding :inferred))
     892                    (infer-character-encoding ioblock))
    874893                  (when (and in-p (eq line-termination :inferred))
    875894                    (infer-line-termination ioblock))
  • trunk/source/level-1/l1-unicode.lisp

    r15535 r15536  
    65316531  (process-file-coding-option (getf (parse-file-options-line line) :coding)
    65326532                              :unix))
     6534(defun external-format-from-octet-buffer (buf count)
     6535  (declare (fixnum count))
     6536  (dotimes (i count)
     6537    (let* ((octet (%get-unsigned-byte buf i)))
     6538      (cond ((or (eql octet (char-code #\linefeed))
     6539                 (eql octet (char-code #\return)))
     6540             (return (external-format-from-file-options (%str-from-ptr buf i))))))))
  • trunk/source/lib/nfcomp.lisp

    r15379 r15536  
    230230           (lexenv (new-lexical-environment defenv))
    231231           (*fasl-compile-time-env* (new-lexical-environment (new-definition-environment)))
    232            (*fcomp-external-format* external-format)
     232           (*fcomp-external-format* (if (eq external-format :default)
     233                                      :inferred
     234                                      external-format))
    233235           (forms nil))
    234236      (let ((current *outstanding-deferred-warnings*) last)
Note: See TracChangeset for help on using the changeset viewer.