| 1 | ;;; -*- Log: hemlock.log; Package: Spell -*-
|
|---|
| 2 | ;;;
|
|---|
| 3 | ;;; **********************************************************************
|
|---|
| 4 | ;;; This code was written as part of the CMU Common Lisp project at
|
|---|
| 5 | ;;; Carnegie Mellon University, and has been placed in the public domain.
|
|---|
| 6 | ;;;
|
|---|
| 7 | ;;; **********************************************************************
|
|---|
| 8 | ;;;
|
|---|
| 9 | ;;; Written by Bill Chiles
|
|---|
| 10 | ;;; Designed by Bill Chiles and Rob Maclachlan
|
|---|
| 11 | ;;;
|
|---|
| 12 |
|
|---|
| 13 | ;;; This file contains code to build a new binary dictionary file from
|
|---|
| 14 | ;;; text in system space. This code relies on implementation dependent
|
|---|
| 15 | ;;; code from spell-rt.lisp. Also, it is expected that spell-corr.lisp
|
|---|
| 16 | ;;; and spell-aug.lisp have been loaded. In order to compile this file,
|
|---|
| 17 | ;;; you must first compile spell-rt, spell-corr.lisp, and spell-aug.lisp.
|
|---|
| 18 |
|
|---|
| 19 | ;;; The text file must be in the following format:
|
|---|
| 20 | ;;; entry1/flag1/flag2/flag3
|
|---|
| 21 | ;;; entry2
|
|---|
| 22 | ;;; entry3/flag1/flag2/flag3/flag4/flag5.
|
|---|
| 23 | ;;; The flags are single letter indicators of legal suffixes for the entry;
|
|---|
| 24 | ;;; the available flags and their correct use may be found at the beginning
|
|---|
| 25 | ;;; of spell-corr.lisp in the Hemlock sources. There must be exactly one
|
|---|
| 26 | ;;; entry per line, and each line must be flushleft.
|
|---|
| 27 |
|
|---|
| 28 |
|
|---|
| 29 | (in-package "SPELL")
|
|---|
| 30 |
|
|---|
| 31 | ;;; An interesting value when building an initial dictionary.
|
|---|
| 32 | (defvar *collision-count* 0)
|
|---|
| 33 |
|
|---|
| 34 | (defvar *new-dictionary*)
|
|---|
| 35 | (defvar *new-descriptors*)
|
|---|
| 36 | (defvar *new-string-table*)
|
|---|
| 37 |
|
|---|
| 38 | (declaim (optimize (debug 3)))
|
|---|
| 39 |
|
|---|
| 40 | |
|---|
| 41 |
|
|---|
| 42 | ;;;; Constants
|
|---|
| 43 |
|
|---|
| 44 | ;;; This is an upper bound estimate of the number of stored entries in the
|
|---|
| 45 | ;;; dictionary. It should not be more than 21,845 because the dictionary
|
|---|
| 46 | ;;; is a vector of type '(unsigned-byte 16), and the descriptors' vector
|
|---|
| 47 | ;;; for the entries uses three '(unsigned-byte 16) elements per descriptor
|
|---|
| 48 | ;;; unit. See the beginning of Spell-Correct.Lisp.
|
|---|
| 49 | ;;;
|
|---|
| 50 | (eval-when (:compile-toplevel :load-toplevel :execute)
|
|---|
| 51 |
|
|---|
| 52 | (defconstant +max-entry-count-estimate+ 15600)
|
|---|
| 53 |
|
|---|
| 54 | (defconstant +new-dictionary-size+ 20011)
|
|---|
| 55 |
|
|---|
| 56 | (defconstant +new-descriptors-size+ (1+ +max-entry-count-estimate+))
|
|---|
| 57 |
|
|---|
| 58 | (defconstant +max-string-table-length+ (* 10 +max-entry-count-estimate+))
|
|---|
| 59 |
|
|---|
| 60 | ); eval-when
|
|---|
| 61 |
|
|---|
| 62 | |
|---|
| 63 |
|
|---|
| 64 | ;;;; Hashing
|
|---|
| 65 |
|
|---|
| 66 | ;;; These hashing macros are different from the ones in Spell-Correct.Lisp
|
|---|
| 67 | ;;; simply because we are using separate space and global specials/constants.
|
|---|
| 68 | ;;; Of course, they should be identical, but it doesn't seem worth cluttering
|
|---|
| 69 | ;;; up Spell-Correct with macro generating macros for this file.
|
|---|
| 70 |
|
|---|
| 71 | ;;; Well, we've made them functions now. we should really clean up the
|
|---|
| 72 | ;;; other macros mentioned above by merging them with these
|
|---|
| 73 |
|
|---|
| 74 | (declaim (inline hash-increment handle-collision get-hash-index))
|
|---|
| 75 | (defun hash-increment (hash size)
|
|---|
| 76 | (- size 2 (rem hash (- size 2))))
|
|---|
| 77 |
|
|---|
| 78 | (defun handle-collision (descriptor-table hash location)
|
|---|
| 79 | (do* ((incr (hash-increment hash +new-dictionary-size+))
|
|---|
| 80 | (collide-location (rem (+ location incr)
|
|---|
| 81 | +new-dictionary-size+)
|
|---|
| 82 | (rem (+ collide-location incr)
|
|---|
| 83 | +new-dictionary-size+)))
|
|---|
| 84 | ;; if we've found our way back to where we started, there are
|
|---|
| 85 | ;; no free slots available. indicate failure.
|
|---|
| 86 | ((= collide-location location) nil)
|
|---|
| 87 | (when (zerop (aref descriptor-table collide-location))
|
|---|
| 88 | (return-from handle-collision collide-location))))
|
|---|
| 89 |
|
|---|
| 90 | (defun get-hash-index (descriptor-table entry entry-length)
|
|---|
| 91 | "Finds a suitable position in DESCRIPTOR-TABLE for ENTRY.
|
|---|
| 92 | Returns NIL if one cannot be located."
|
|---|
| 93 | (let* ((hash (string-hash entry entry-length))
|
|---|
| 94 | (location (rem hash +new-dictionary-size+)))
|
|---|
| 95 | (cond
|
|---|
| 96 | ((not (zerop (aref descriptor-table location)))
|
|---|
| 97 | ;; crud. the desirable spot was already taken. hunt for another
|
|---|
| 98 | (incf *collision-count*)
|
|---|
| 99 | (handle-collision descriptor-table hash location))
|
|---|
| 100 | (t location))))
|
|---|
| 101 |
|
|---|
| 102 | |
|---|
| 103 |
|
|---|
| 104 | ;;;; Build-Dictionary
|
|---|
| 105 |
|
|---|
| 106 | (defun build-dictionary (input output)
|
|---|
| 107 | (let* ((descriptors (make-array +new-descriptors-size+))
|
|---|
| 108 | (string-table (make-string +max-string-table-length+))
|
|---|
| 109 | (descriptor-table (make-array +new-dictionary-size+
|
|---|
| 110 | :element-type '(unsigned-byte 16)))
|
|---|
| 111 | (new-dictionary (make-instance 'dictionary
|
|---|
| 112 | :string-table string-table
|
|---|
| 113 | :descriptors descriptors
|
|---|
| 114 | :descriptor-table descriptor-table)))
|
|---|
| 115 | (write-line "Reading dictionary ...")
|
|---|
| 116 | (force-output)
|
|---|
| 117 | (setf *collision-count* 0)
|
|---|
| 118 | (multiple-value-bind (entry-count string-table-length)
|
|---|
| 119 | (read-initial-dictionary input descriptor-table
|
|---|
| 120 | descriptors string-table)
|
|---|
| 121 | (write-line "Writing dictionary ...")
|
|---|
| 122 | (force-output)
|
|---|
| 123 | (write-dictionary output new-dictionary entry-count string-table-length)
|
|---|
| 124 | (format t "~D entries processed with ~D collisions."
|
|---|
| 125 | entry-count *collision-count*)
|
|---|
| 126 | new-dictionary)))
|
|---|
| 127 |
|
|---|
| 128 | (defun read-initial-dictionary (f dictionary descriptors string-table)
|
|---|
| 129 | (let* ((filename (pathname f))
|
|---|
| 130 | (s (open filename :direction :input :if-does-not-exist nil)))
|
|---|
| 131 | (unless s (error "File ~S does not exist." f))
|
|---|
| 132 | (multiple-value-prog1
|
|---|
| 133 | (let ((descriptor-ptr 1)
|
|---|
| 134 | (string-ptr 0)
|
|---|
| 135 | (entry-count 0))
|
|---|
| 136 | (declare (fixnum descriptor-ptr string-ptr entry-count))
|
|---|
| 137 | (loop (multiple-value-bind (line eofp) (read-line s nil nil)
|
|---|
| 138 | (declare (type (or null simple-string) line))
|
|---|
| 139 | (unless line (return (values entry-count string-ptr)))
|
|---|
| 140 | (incf entry-count)
|
|---|
| 141 | (when (> entry-count +max-entry-count-estimate+)
|
|---|
| 142 | (error "There are too many entries in text file!~%~
|
|---|
| 143 | Please change constants in spell-build.lisp, ~
|
|---|
| 144 | recompile the file, and reload it.~%~
|
|---|
| 145 | Be sure to understand the constraints of permissible ~
|
|---|
| 146 | values."))
|
|---|
| 147 | (let ((flags (or (position #\/ line :test #'char=)
|
|---|
| 148 | (length line))))
|
|---|
| 149 | (declare (fixnum flags))
|
|---|
| 150 | (cond ((> flags +max-entry-length+)
|
|---|
| 151 | (format t "Entry ~s too long." (subseq line 0 flags))
|
|---|
| 152 | (force-output))
|
|---|
| 153 | (t (let ((new-string-ptr (+ string-ptr flags)))
|
|---|
| 154 | (declare (fixnum new-string-ptr))
|
|---|
| 155 | (when (> new-string-ptr +max-string-table-length+)
|
|---|
| 156 | (error "Spell string table overflow!~%~
|
|---|
| 157 | Please change constants in ~
|
|---|
| 158 | spell-build.lisp, recompile the file, ~
|
|---|
| 159 | and reload it.~%~
|
|---|
| 160 | Be sure to understand the constraints ~
|
|---|
| 161 | of permissible values."))
|
|---|
| 162 | (spell-place-entry line flags
|
|---|
| 163 | dictionary descriptors string-table
|
|---|
| 164 | descriptor-ptr string-ptr)
|
|---|
| 165 | (incf descriptor-ptr)
|
|---|
| 166 | (setf string-ptr new-string-ptr)))))
|
|---|
| 167 | (when eofp (return (values entry-count string-ptr))))))
|
|---|
| 168 | (close s))))
|
|---|
| 169 |
|
|---|
| 170 | (defun word-flags (line word-end)
|
|---|
| 171 | (declare (simple-string line) (fixnum word-end))
|
|---|
| 172 | (let ((word-flags 0))
|
|---|
| 173 | (do ((flag (1+ word-end) (+ 2 flag))
|
|---|
| 174 | (line-end (length line)))
|
|---|
| 175 | ((>= flag line-end) word-flags)
|
|---|
| 176 | (declare (fixnum flag line-end))
|
|---|
| 177 | (let ((flag-mask (flag-mask (schar line flag))))
|
|---|
| 178 | (declare (fixnum flag-mask))
|
|---|
| 179 | (if (zerop flag-mask)
|
|---|
| 180 | (format t "Illegal flag ~S on word ~S."
|
|---|
| 181 | (schar line flag) (subseq line 0 word-end))
|
|---|
| 182 | (setf word-flags
|
|---|
| 183 | (logior flag-mask word-flags)))))))
|
|---|
| 184 |
|
|---|
| 185 | (defun spell-place-entry (line word-end dictionary descriptors string-table
|
|---|
| 186 | descriptor-ptr string-ptr)
|
|---|
| 187 | (declare (simple-string line string-table)
|
|---|
| 188 | (fixnum word-end descriptor-ptr string-ptr))
|
|---|
| 189 | (nstring-upcase line :end word-end)
|
|---|
| 190 | (let* ((hash-loc (get-hash-index dictionary line word-end)))
|
|---|
| 191 | (unless hash-loc (error "Dictionary Overflow!"))
|
|---|
| 192 | (setf (aref dictionary hash-loc) descriptor-ptr)
|
|---|
| 193 | (let* ((hash-code (ldb +new-hash-byte+
|
|---|
| 194 | (string-hash line word-end)))
|
|---|
| 195 | (descriptor (make-descriptor :hash-code hash-code
|
|---|
| 196 | :length word-end
|
|---|
| 197 | :string-index string-ptr)))
|
|---|
| 198 | (setf (desc-flags descriptor) (word-flags line word-end)
|
|---|
| 199 | (aref descriptors descriptor-ptr) descriptor)
|
|---|
| 200 | (replace string-table line :start1 string-ptr :end2 word-end))))
|
|---|