| 1 | ;;; -*- Log: hemlock.log; Package: Spell -*-
|
|---|
| 2 | ;;;
|
|---|
| 3 | ;;; **********************************************************************
|
|---|
| 4 | ;;; This code was written as part of the CMU Common Lisp project at
|
|---|
| 5 | ;;; Carnegie Mellon University, and has been placed in the public domain.
|
|---|
| 6 | ;;;
|
|---|
| 7 | #+CMU (ext:file-comment
|
|---|
| 8 | "$Header$")
|
|---|
| 9 | ;;;
|
|---|
| 10 | ;;; **********************************************************************
|
|---|
| 11 | ;;;
|
|---|
| 12 | ;;; Written by Bill Chiles
|
|---|
| 13 | ;;; Designed by Bill Chiles and Rob Maclachlan
|
|---|
| 14 | ;;;
|
|---|
| 15 |
|
|---|
| 16 | ;;; This is the file that deals with checking and correcting words
|
|---|
| 17 | ;;; using a dictionary read in from a binary file. It has been written
|
|---|
| 18 | ;;; from the basic ideas used in Ispell (on DEC-20's) which originated as
|
|---|
| 19 | ;;; Spell on the ITS machines at MIT. There are flags which have proper
|
|---|
| 20 | ;;; uses defined for them that indicate permissible suffixes to entries.
|
|---|
| 21 | ;;; This allows for about three times as many known words than are actually
|
|---|
| 22 | ;;; stored. When checking the spelling of a word, first it is looked up;
|
|---|
| 23 | ;;; if this fails, then possible roots are looked up, and if any has the
|
|---|
| 24 | ;;; appropriate suffix flag, then the word is considered to be correctly
|
|---|
| 25 | ;;; spelled. For an unknown word, the following rules define "close" words
|
|---|
| 26 | ;;; which are possible corrections:
|
|---|
| 27 | ;;; 1] two adjacent letters are transposed to form a correct spelling;
|
|---|
| 28 | ;;; 2] one letter is changed to form a correct spelling;
|
|---|
| 29 | ;;; 3] one letter is added to form a correct spelling; and/or
|
|---|
| 30 | ;;; 4] one letter is removed to form a correct spelling.
|
|---|
| 31 | ;;; There are two restrictions on the length of a word in regards to its
|
|---|
| 32 | ;;; worthiness of recognition: it must be at least more than two letters
|
|---|
| 33 | ;;; long, and if it has a suffix, then it must be at least four letters
|
|---|
| 34 | ;;; long. More will be said about this when the flags are discussed.
|
|---|
| 35 | ;;; This is implemented in as tense a fashion as possible, and it uses
|
|---|
| 36 | ;;; implementation dependent code from Spell-RT.Lisp to accomplish this.
|
|---|
| 37 | ;;; In general the file I/O and structure accesses encompass the system
|
|---|
| 38 | ;;; dependencies.
|
|---|
| 39 |
|
|---|
| 40 | ;;; This next section will discuss the storage of the dictionary
|
|---|
| 41 | ;;; information. There are three data structures that "are" the
|
|---|
| 42 | ;;; dictionary: a hash table, descriptors table, and a string table. The
|
|---|
| 43 | ;;; hash table is a vector of type '(unsigned-byte 16), whose elements
|
|---|
| 44 | ;;; point into the descriptors table. This is a cyclic hash table to
|
|---|
| 45 | ;;; facilitate dumping it to a file. The descriptors table (also of type
|
|---|
| 46 | ;;; '(unsigned-byte 16)) dedicates three elements to each entry in the
|
|---|
| 47 | ;;; dictionary. Each group of three elements has the following organization
|
|---|
| 48 | ;;; imposed on them:
|
|---|
| 49 | ;;; ----------------------------------------------
|
|---|
| 50 | ;;; | 15..5 hash code | 4..0 length |
|
|---|
| 51 | ;;; ----------------------------------------------
|
|---|
| 52 | ;;; | 15..0 character index |
|
|---|
| 53 | ;;; ----------------------------------------------
|
|---|
| 54 | ;;; | 15..14 character index | 13..0 flags |
|
|---|
| 55 | ;;; ----------------------------------------------
|
|---|
| 56 | ;;; "Length" is the number of characters in the entry; "hash code" is some
|
|---|
| 57 | ;;; eleven bits from the hash code to allow for quicker lookup, "flags"
|
|---|
| 58 | ;;; indicate possible suffixes for the basic entry, and "character index"
|
|---|
| 59 | ;;; is the index of the start of the entry in the string table.
|
|---|
| 60 | ;;; This was originally adopted due to the Perq's word size (can you guess?
|
|---|
| 61 | ;;; 16 bits, that's right). Note the constraint that is placed on the number
|
|---|
| 62 | ;;; of the entries, 21845, because the hash table could not point to more
|
|---|
| 63 | ;;; descriptor units (16 bits of pointer divided by three). Since a value of
|
|---|
| 64 | ;;; zero as a hash table element indicates an empty location, the zeroth element
|
|---|
| 65 | ;;; of the descriptors table must be unused (it cannot be pointed to).
|
|---|
| 66 |
|
|---|
| 67 |
|
|---|
| 68 | ;;; The following is a short discussion with examples of the correct
|
|---|
| 69 | ;;; use of the suffix flags. Let # and @ be symbols that can stand for any
|
|---|
| 70 | ;;; single letter. Upper case letters are constants. "..." stands for any
|
|---|
| 71 | ;;; string of zero or more letters, but note that no word may exist in the
|
|---|
| 72 | ;;; dictionary which is not at least 2 letters long, so, for example, FLY
|
|---|
| 73 | ;;; may not be produced by placing the "Y" flag on "F". Also, no flag is
|
|---|
| 74 | ;;; effective unless the word that it creates is at least 4 letters long,
|
|---|
| 75 | ;;; so, for example, WED may not be produced by placing the "D" flag on
|
|---|
| 76 | ;;; "WE". These flags and examples are from the Ispell documentation with
|
|---|
| 77 | ;;; only slight modifications. Here are the correct uses of the flags:
|
|---|
| 78 | ;;;
|
|---|
| 79 | ;;; "V" flag:
|
|---|
| 80 | ;;; ...E => ...IVE as in create => creative
|
|---|
| 81 | ;;; if # .ne. E, then ...# => ...#IVE as in prevent => preventive
|
|---|
| 82 | ;;;
|
|---|
| 83 | ;;; "N" flag:
|
|---|
| 84 | ;;; ...E => ...ION as in create => creation
|
|---|
| 85 | ;;; ...Y => ...ICATION as in multiply => multiplication
|
|---|
| 86 | ;;; if # .ne. E or Y, then ...# => ...#EN as in fall => fallen
|
|---|
| 87 | ;;;
|
|---|
| 88 | ;;; "X" flag:
|
|---|
| 89 | ;;; ...E => ...IONS as in create => creations
|
|---|
| 90 | ;;; ...Y => ...ICATIONS as in multiply => multiplications
|
|---|
| 91 | ;;; if # .ne. E or Y, ...# => ...#ENS as in weak => weakens
|
|---|
| 92 | ;;;
|
|---|
| 93 | ;;; "H" flag:
|
|---|
| 94 | ;;; ...Y => ...IETH as in twenty => twentieth
|
|---|
| 95 | ;;; if # .ne. Y, then ...# => ...#TH as in hundred => hundredth
|
|---|
| 96 | ;;;
|
|---|
| 97 | ;;; "Y" FLAG:
|
|---|
| 98 | ;;; ... => ...LY as in quick => quickly
|
|---|
| 99 | ;;;
|
|---|
| 100 | ;;; "G" FLAG:
|
|---|
| 101 | ;;; ...E => ...ING as in file => filing
|
|---|
| 102 | ;;; if # .ne. E, then ...# => ...#ING as in cross => crossing
|
|---|
| 103 | ;;;
|
|---|
| 104 | ;;; "J" FLAG"
|
|---|
| 105 | ;;; ...E => ...INGS as in file => filings
|
|---|
| 106 | ;;; if # .ne. E, then ...# => ...#INGS as in cross => crossings
|
|---|
| 107 | ;;;
|
|---|
| 108 | ;;; "D" FLAG:
|
|---|
| 109 | ;;; ...E => ...ED as in create => created
|
|---|
| 110 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 111 | ;;; then ...@Y => ...@IED as in imply => implied
|
|---|
| 112 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 113 | ;;; then ...@# => ...@#ED as in convey => conveyed
|
|---|
| 114 | ;;; if # .ne. E or Y, then ...# => ...#ED as in cross => crossed
|
|---|
| 115 | ;;;
|
|---|
| 116 | ;;; "T" FLAG:
|
|---|
| 117 | ;;; ...E => ...EST as in late => latest
|
|---|
| 118 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 119 | ;;; then ...@Y => ...@IEST as in dirty => dirtiest
|
|---|
| 120 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 121 | ;;; then ...@# => ...@#EST as in gray => grayest
|
|---|
| 122 | ;;; if # .ne. E or Y, then ...# => ...#EST as in small => smallest
|
|---|
| 123 | ;;;
|
|---|
| 124 | ;;; "R" FLAG:
|
|---|
| 125 | ;;; ...E => ...ER as in skate => skater
|
|---|
| 126 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 127 | ;;; then ...@Y => ...@IER as in multiply => multiplier
|
|---|
| 128 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 129 | ;;; then ...@# => ...@#ER as in convey => conveyer
|
|---|
| 130 | ;;; if # .ne. E or Y, then ...# => ...#ER as in build => builder
|
|---|
| 131 | ;;;
|
|---|
| 132 |
|
|---|
| 133 | ;;; "Z FLAG:
|
|---|
| 134 | ;;; ...E => ...ERS as in skate => skaters
|
|---|
| 135 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 136 | ;;; then ...@Y => ...@IERS as in multiply => multipliers
|
|---|
| 137 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 138 | ;;; then ...@# => ...@#ERS as in slay => slayers
|
|---|
| 139 | ;;; if # .ne. E or Y, then ...@# => ...@#ERS as in build => builders
|
|---|
| 140 | ;;;
|
|---|
| 141 | ;;; "S" FLAG:
|
|---|
| 142 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 143 | ;;; then ...@Y => ...@IES as in imply => implies
|
|---|
| 144 | ;;; if # .eq. S, X, Z, or H,
|
|---|
| 145 | ;;; then ...# => ...#ES as in fix => fixes
|
|---|
| 146 | ;;; if # .ne. S, X, Z, H, or Y,
|
|---|
| 147 | ;;; then ...# => ...#S as in bat => bats
|
|---|
| 148 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 149 | ;;; then ...@# => ...@#S as in convey => conveys
|
|---|
| 150 | ;;;
|
|---|
| 151 | ;;; "P" FLAG:
|
|---|
| 152 | ;;; if # .ne. Y, or @ = A, E, I, O, or U,
|
|---|
| 153 | ;;; then ...@# => ...@#NESS as in late => lateness and
|
|---|
| 154 | ;;; gray => grayness
|
|---|
| 155 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 156 | ;;; then ...@Y => ...@INESS as in cloudy => cloudiness
|
|---|
| 157 | ;;;
|
|---|
| 158 | ;;; "M" FLAG:
|
|---|
| 159 | ;;; ... => ...'S as in DOG => DOG'S
|
|---|
| 160 |
|
|---|
| 161 | (in-package "SPELL")
|
|---|
| 162 |
|
|---|
| 163 | |
|---|
| 164 |
|
|---|
| 165 | ;;;; Some Constants
|
|---|
| 166 |
|
|---|
| 167 | (eval-when (:compile-toplevel :execute :load-toplevel)
|
|---|
| 168 |
|
|---|
| 169 | (defconstant spell-deleted-entry #xFFFF)
|
|---|
| 170 |
|
|---|
| 171 | ;;; The next number (using 6 bits) is 63, and that's pretty silly because
|
|---|
| 172 | ;;; "supercalafragalistic" is less than 31 characters long.
|
|---|
| 173 | ;;;
|
|---|
| 174 | (defconstant max-entry-length 31
|
|---|
| 175 | "This the maximum number of characters an entry may have.")
|
|---|
| 176 |
|
|---|
| 177 | ;;; These are the flags (described above), and an entry is allowed a
|
|---|
| 178 | ;;; certain suffix if the appropriate bit is on in the third element of
|
|---|
| 179 | ;;; its descriptor unit (described above).
|
|---|
| 180 | ;;;
|
|---|
| 181 | (defconstant V-mask (ash 1 13))
|
|---|
| 182 | (defconstant N-mask (ash 1 12))
|
|---|
| 183 | (defconstant X-mask (ash 1 11))
|
|---|
| 184 | (defconstant H-mask (ash 1 10))
|
|---|
| 185 | (defconstant Y-mask (ash 1 9))
|
|---|
| 186 | (defconstant G-mask (ash 1 8))
|
|---|
| 187 | (defconstant J-mask (ash 1 7))
|
|---|
| 188 | (defconstant D-mask (ash 1 6))
|
|---|
| 189 | (defconstant T-mask (ash 1 5))
|
|---|
| 190 | (defconstant R-mask (ash 1 4))
|
|---|
| 191 | (defconstant Z-mask (ash 1 3))
|
|---|
| 192 | (defconstant S-mask (ash 1 2))
|
|---|
| 193 | (defconstant P-mask (ash 1 1))
|
|---|
| 194 | (defconstant M-mask 1)
|
|---|
| 195 |
|
|---|
| 196 |
|
|---|
| 197 | ;;; These are the eleven bits of a computed hash that are stored as part of
|
|---|
| 198 | ;;; an entries descriptor unit. The shifting constant is how much the
|
|---|
| 199 | ;;; eleven bits need to be shifted to the right, so they take up the upper
|
|---|
| 200 | ;;; eleven bits of one 16-bit element in a descriptor unit.
|
|---|
| 201 | ;;;
|
|---|
| 202 | (defconstant new-hash-byte (byte 11 13))
|
|---|
| 203 | (defconstant stored-hash-byte (byte 11 5))
|
|---|
| 204 |
|
|---|
| 205 |
|
|---|
| 206 | ;;; The next two constants are used to extract information from an entry's
|
|---|
| 207 | ;;; descriptor unit. The first is the two most significant bits of 18
|
|---|
| 208 | ;;; bits that hold an index into the string table where the entry is
|
|---|
| 209 | ;;; located. If this is confusing, regard the diagram of the descriptor
|
|---|
| 210 | ;;; units above.
|
|---|
| 211 | ;;;
|
|---|
| 212 | (defconstant whole-index-high-byte (byte 2 16))
|
|---|
| 213 | (defconstant stored-index-high-byte (byte 2 14))
|
|---|
| 214 | (defconstant stored-length-byte (byte 5 0))
|
|---|
| 215 |
|
|---|
| 216 |
|
|---|
| 217 | ); eval-when (:compile-toplevel :execute :load-toplevel)
|
|---|
| 218 |
|
|---|
| 219 | |
|---|
| 220 |
|
|---|
| 221 | ;;;; Some Specials and Accesses
|
|---|
| 222 |
|
|---|
| 223 | ;;; *spell-aeiou* will have bits on that represent the capital letters
|
|---|
| 224 | ;;; A, E, I, O, and U to be used to determine if some word roots are legal
|
|---|
| 225 | ;;; for looking up.
|
|---|
| 226 | ;;;
|
|---|
| 227 | (defvar *aeiou*
|
|---|
| 228 | (make-array 128 :element-type 'bit :initial-element 0))
|
|---|
| 229 |
|
|---|
| 230 | (setf (aref *aeiou* (char-code #\A)) 1)
|
|---|
| 231 | (setf (aref *aeiou* (char-code #\E)) 1)
|
|---|
| 232 | (setf (aref *aeiou* (char-code #\I)) 1)
|
|---|
| 233 | (setf (aref *aeiou* (char-code #\O)) 1)
|
|---|
| 234 | (setf (aref *aeiou* (char-code #\U)) 1)
|
|---|
| 235 |
|
|---|
| 236 |
|
|---|
| 237 | ;;; *sxzh* will have bits on that represent the capital letters
|
|---|
| 238 | ;;; S, X, Z, and H to be used to determine if some word roots are legal for
|
|---|
| 239 | ;;; looking up.
|
|---|
| 240 | ;;;
|
|---|
| 241 | (defvar *sxzh*
|
|---|
| 242 | (make-array 128 :element-type 'bit :initial-element 0))
|
|---|
| 243 |
|
|---|
| 244 | (setf (aref *sxzh* (char-code #\S)) 1)
|
|---|
| 245 | (setf (aref *sxzh* (char-code #\X)) 1)
|
|---|
| 246 | (setf (aref *sxzh* (char-code #\Z)) 1)
|
|---|
| 247 | (setf (aref *sxzh* (char-code #\H)) 1)
|
|---|
| 248 |
|
|---|
| 249 |
|
|---|
| 250 | ;;; SET-MEMBER-P will be used with *aeiou* and *sxzh* to determine if a
|
|---|
| 251 | ;;; character is in the specified set.
|
|---|
| 252 | ;;;
|
|---|
| 253 | (eval-when (:compile-toplevel :execute)
|
|---|
| 254 | (defmacro set-member-p (char set)
|
|---|
| 255 | `(not (zerop (the fixnum (aref (the simple-bit-vector ,set)
|
|---|
| 256 | (char-code ,char))))))
|
|---|
| 257 | ) ;eval-when
|
|---|
| 258 |
|
|---|
| 259 |
|
|---|
| 260 | (defvar *dictionary*)
|
|---|
| 261 | (defvar *dictionary-size*)
|
|---|
| 262 | (defvar *descriptors*)
|
|---|
| 263 | (defvar *descriptors-size*)
|
|---|
| 264 | (defvar *string-table*)
|
|---|
| 265 | (defvar *string-table-size*)
|
|---|
| 266 |
|
|---|
| 267 |
|
|---|
| 268 | (eval-when (:compile-toplevel :execute)
|
|---|
| 269 |
|
|---|
| 270 | ;;; DICTIONARY-REF and DESCRIPTOR-REF are references to implementation
|
|---|
| 271 | ;;; dependent structures. *dictionary* and *descriptors* are "system
|
|---|
| 272 | ;;; area pointers" as a result of the way the binary file is opened for
|
|---|
| 273 | ;;; fast access.
|
|---|
| 274 | ;;;
|
|---|
| 275 | (defmacro dictionary-ref (idx)
|
|---|
| 276 | `(sapref *dictionary* ,idx))
|
|---|
| 277 |
|
|---|
| 278 | (defmacro descriptor-ref (idx)
|
|---|
| 279 | `(sapref *descriptors* ,idx))
|
|---|
| 280 |
|
|---|
| 281 |
|
|---|
| 282 | ;;; DESCRIPTOR-STRING-START access an entry's (indicated by idx)
|
|---|
| 283 | ;;; descriptor unit (described at the beginning of the file) and returns
|
|---|
| 284 | ;;; the start index of the entry in the string table. The second of three
|
|---|
| 285 | ;;; words in the descriptor holds the 16 least significant bits of 18, and
|
|---|
| 286 | ;;; the top two bits of the third word are the 2 most significant bits.
|
|---|
| 287 | ;;; These 18 bits are the index into the string table.
|
|---|
| 288 | ;;;
|
|---|
| 289 | (defmacro descriptor-string-start (idx)
|
|---|
| 290 | `(dpb (the fixnum (ldb stored-index-high-byte
|
|---|
| 291 | (the fixnum (descriptor-ref (+ 2 ,idx)))))
|
|---|
| 292 | whole-index-high-byte
|
|---|
| 293 | (the fixnum (descriptor-ref (1+ ,idx)))))
|
|---|
| 294 |
|
|---|
| 295 | ) ;eval-when
|
|---|
| 296 |
|
|---|
| 297 |
|
|---|
| 298 | |
|---|
| 299 |
|
|---|
| 300 | ;;;; Top level Checking/Correcting
|
|---|
| 301 |
|
|---|
| 302 | ;;; CORRECT-SPELLING can be called from top level to check/correct a words
|
|---|
| 303 | ;;; spelling. It is not used for any other purpose.
|
|---|
| 304 | ;;;
|
|---|
| 305 | (defun correct-spelling (word)
|
|---|
| 306 | "Check/correct the spelling of word. Output is done to *standard-output*."
|
|---|
| 307 | (setf word (coerce word 'simple-string))
|
|---|
| 308 | (let ((word (string-upcase (the simple-string word)))
|
|---|
| 309 | (word-len (length (the simple-string word))))
|
|---|
| 310 | (declare (simple-string word) (fixnum word-len))
|
|---|
| 311 | (maybe-read-spell-dictionary)
|
|---|
| 312 | (when (= word-len 1)
|
|---|
| 313 | (error "Single character words are not in the dictionary."))
|
|---|
| 314 | (when (> word-len max-entry-length)
|
|---|
| 315 | (error "~A is too long for the dictionary." word))
|
|---|
| 316 | (multiple-value-bind (idx used-flag-p)
|
|---|
| 317 | (spell-try-word word word-len)
|
|---|
| 318 | (if idx
|
|---|
| 319 | (format t "Found it~:[~; because of ~A~]." used-flag-p
|
|---|
| 320 | (spell-root-word idx))
|
|---|
| 321 | (let ((close-words (spell-collect-close-words word)))
|
|---|
| 322 | (if close-words
|
|---|
| 323 | (format *standard-output*
|
|---|
| 324 | "The possible correct spelling~[~; is~:;s are~]:~
|
|---|
| 325 | ~:*~[~; ~{~A~}~;~{ ~A~^ and~}~:;~
|
|---|
| 326 | ~{~#[~; and~] ~A~^,~}~]."
|
|---|
| 327 | (length close-words)
|
|---|
| 328 | close-words)
|
|---|
| 329 | (format *standard-output* "Word not found.")))))))
|
|---|
| 330 |
|
|---|
| 331 |
|
|---|
| 332 | (defvar *dictionary-read-p* nil)
|
|---|
| 333 |
|
|---|
| 334 | ;;; MAYBE-READ-SPELL-DICTIONARY -- Public
|
|---|
| 335 | ;;;
|
|---|
| 336 | (defun maybe-read-spell-dictionary ()
|
|---|
| 337 | "Read the spelling dictionary if it has not be read already."
|
|---|
| 338 | (unless *dictionary-read-p* (read-dictionary)))
|
|---|
| 339 |
|
|---|
| 340 |
|
|---|
| 341 | (defun spell-root-word (index)
|
|---|
| 342 | "Return the root word corresponding to a dictionary entry at index."
|
|---|
| 343 | (let* ((start (descriptor-string-start index))
|
|---|
| 344 | (len (the fixnum (ldb stored-length-byte
|
|---|
| 345 | (the fixnum (descriptor-ref index)))))
|
|---|
| 346 | (result (make-string len)))
|
|---|
| 347 | (declare (fixnum start len)
|
|---|
| 348 | (simple-string result))
|
|---|
| 349 | (sap-replace result (the system-area-pointer *string-table*)
|
|---|
| 350 | start 0 len)
|
|---|
| 351 | result))
|
|---|
| 352 |
|
|---|
| 353 |
|
|---|
| 354 | (eval-when (:compile-toplevel :execute)
|
|---|
| 355 | (defmacro check-closeness (word word-len closeness-list)
|
|---|
| 356 | `(if (spell-try-word ,word ,word-len)
|
|---|
| 357 | (pushnew (subseq ,word 0 ,word-len) ,closeness-list :test #'string=)))
|
|---|
| 358 | ) ;eval-when
|
|---|
| 359 |
|
|---|
| 360 | (defconstant spell-alphabet
|
|---|
| 361 | (list #\A #\B #\C #\D #\E #\F #\G #\H
|
|---|
| 362 | #\I #\J #\K #\L #\M #\N #\O #\P
|
|---|
| 363 | #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z))
|
|---|
| 364 |
|
|---|
| 365 | ;;; SPELL-COLLECT-CLOSE-WORDS Returns a list of all "close" correctly spelled
|
|---|
| 366 | ;;; words. The definition of "close" is at the beginning of the file, and
|
|---|
| 367 | ;;; there are four sections to this function which collect each of the four
|
|---|
| 368 | ;;; different kinds of close words.
|
|---|
| 369 | ;;;
|
|---|
| 370 | (defun spell-collect-close-words (word)
|
|---|
| 371 | "Returns a list of all \"close\" correctly spelled words. This has the
|
|---|
| 372 | same contraints as SPELL-TRY-WORD, which you have probably already called
|
|---|
| 373 | if you are calling this."
|
|---|
| 374 | (declare (simple-string word))
|
|---|
| 375 | (let* ((word-len (length word))
|
|---|
| 376 | (word-len--1 (1- word-len))
|
|---|
| 377 | (word-len-+1 (1+ word-len))
|
|---|
| 378 | (result ())
|
|---|
| 379 | (correcting-buffer (make-string max-entry-length)))
|
|---|
| 380 | (declare (simple-string correcting-buffer)
|
|---|
| 381 | (fixnum word-len word-len--1 word-len-+1))
|
|---|
| 382 | (replace correcting-buffer word :end1 word-len :end2 word-len)
|
|---|
| 383 |
|
|---|
| 384 | ;; Misspelled because one letter is different.
|
|---|
| 385 | (dotimes (i word-len)
|
|---|
| 386 | (do ((save-char (schar correcting-buffer i))
|
|---|
| 387 | (alphabet spell-alphabet (cdr alphabet)))
|
|---|
| 388 | ((null alphabet)
|
|---|
| 389 | (setf (schar correcting-buffer i) save-char))
|
|---|
| 390 | (setf (schar correcting-buffer i) (car alphabet))
|
|---|
| 391 | (check-closeness correcting-buffer word-len result)))
|
|---|
| 392 |
|
|---|
| 393 | ;; Misspelled because two adjacent letters are transposed.
|
|---|
| 394 | (dotimes (i word-len--1)
|
|---|
| 395 | (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i)))
|
|---|
| 396 | (check-closeness correcting-buffer word-len result)
|
|---|
| 397 | (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i))))
|
|---|
| 398 |
|
|---|
| 399 | ;; Misspelled because of extraneous letter.
|
|---|
| 400 | (replace correcting-buffer word
|
|---|
| 401 | :start2 1 :end1 word-len--1 :end2 word-len)
|
|---|
| 402 | (check-closeness correcting-buffer word-len--1 result)
|
|---|
| 403 | (dotimes (i word-len--1)
|
|---|
| 404 | (setf (schar correcting-buffer i) (schar word i))
|
|---|
| 405 | (replace correcting-buffer word
|
|---|
| 406 | :start1 (1+ i) :start2 (+ i 2) :end1 word-len--1 :end2 word-len)
|
|---|
| 407 | (check-closeness correcting-buffer word-len--1 result))
|
|---|
| 408 |
|
|---|
| 409 | ;; Misspelled because a letter is missing.
|
|---|
| 410 | (replace correcting-buffer word
|
|---|
| 411 | :start1 1 :end1 word-len-+1 :end2 word-len)
|
|---|
| 412 | (dotimes (i word-len-+1)
|
|---|
| 413 | (do ((alphabet spell-alphabet (cdr alphabet)))
|
|---|
| 414 | ((null alphabet)
|
|---|
| 415 | (rotatef (schar correcting-buffer i)
|
|---|
| 416 | (schar correcting-buffer (1+ i))))
|
|---|
| 417 | (setf (schar correcting-buffer i) (car alphabet))
|
|---|
| 418 | (check-closeness correcting-buffer word-len-+1 result)))
|
|---|
| 419 | result))
|
|---|
| 420 |
|
|---|
| 421 | ;;; SPELL-TRY-WORD The literal 4 is not a constant defined somewhere since it
|
|---|
| 422 | ;;; is part of the definition of the function of looking up words.
|
|---|
| 423 | ;;; TRY-WORD-ENDINGS relies on the guarantee that word-len is at least 4.
|
|---|
| 424 | ;;;
|
|---|
| 425 | (defun spell-try-word (word word-len)
|
|---|
| 426 | "See if the word or an appropriate root is in the spelling dicitionary.
|
|---|
| 427 | Word-len must be inclusively in the range 2..max-entry-length."
|
|---|
| 428 | (or (lookup-entry word word-len)
|
|---|
| 429 | (if (>= (the fixnum word-len) 4)
|
|---|
| 430 | (try-word-endings word word-len))))
|
|---|
| 431 |
|
|---|
| 432 |
|
|---|
| 433 | |
|---|
| 434 |
|
|---|
| 435 | ;;;; Divining Correct Spelling
|
|---|
| 436 |
|
|---|
| 437 | (eval-when (:compile-toplevel :execute)
|
|---|
| 438 |
|
|---|
| 439 | (defmacro setup-root-buffer (word buffer root-len)
|
|---|
| 440 | `(replace ,buffer ,word :end1 ,root-len :end2 ,root-len))
|
|---|
| 441 |
|
|---|
| 442 | (defmacro try-root (word root-len flag-mask)
|
|---|
| 443 | (let ((result (gensym)))
|
|---|
| 444 | `(let ((,result (lookup-entry ,word ,root-len)))
|
|---|
| 445 | (if (and ,result (descriptor-flag ,result ,flag-mask))
|
|---|
| 446 | (return (values ,result ,flag-mask))))))
|
|---|
| 447 |
|
|---|
| 448 | ;;; TRY-MODIFIED-ROOT is used for root words that become truncated
|
|---|
| 449 | ;;; when suffixes are added (e.g., skate => skating). Char-idx is the last
|
|---|
| 450 | ;;; character in the root that has to typically be changed from a #\I to a
|
|---|
| 451 | ;;; #\Y or #\E.
|
|---|
| 452 | ;;;
|
|---|
| 453 | (defmacro try-modified-root (word buffer root-len flag-mask char-idx new-char)
|
|---|
| 454 | (let ((root-word (gensym)))
|
|---|
| 455 | `(let ((,root-word (setup-root-buffer ,word ,buffer ,root-len)))
|
|---|
| 456 | (setf (schar ,root-word ,char-idx) ,new-char)
|
|---|
| 457 | (try-root ,root-word ,root-len ,flag-mask))))
|
|---|
| 458 |
|
|---|
| 459 | ) ;eval-when
|
|---|
| 460 |
|
|---|
| 461 |
|
|---|
| 462 | (defvar *rooting-buffer* (make-string max-entry-length))
|
|---|
| 463 |
|
|---|
| 464 | ;;; TRY-WORD-ENDINGS takes a word that is at least of length 4 and
|
|---|
| 465 | ;;; returns multiple values on success (the index where the word's root's
|
|---|
| 466 | ;;; descriptor starts and :used-flag), otherwise nil. It looks at
|
|---|
| 467 | ;;; characters from the end to the beginning of the word to determine if it
|
|---|
| 468 | ;;; has any known suffixes. This is a VERY simple finite state machine
|
|---|
| 469 | ;;; where all of the suffixes are narrowed down to one possible one in at
|
|---|
| 470 | ;;; most two state changes. This is a PROG form for speed, and in some sense,
|
|---|
| 471 | ;;; readability. The states of the machine are the flag names that denote
|
|---|
| 472 | ;;; suffixes. The two points of branching to labels are the very beginning
|
|---|
| 473 | ;;; of the PROG and the S state. This is a fairly straight forward
|
|---|
| 474 | ;;; implementation of the flag rules presented at the beginning of this
|
|---|
| 475 | ;;; file, with char-idx checks, so we do not index the string below zero.
|
|---|
| 476 |
|
|---|
| 477 | (defun try-word-endings (word word-len)
|
|---|
| 478 | (declare (simple-string word)
|
|---|
| 479 | (fixnum word-len))
|
|---|
| 480 | (prog* ((char-idx (1- word-len))
|
|---|
| 481 | (char (schar word char-idx))
|
|---|
| 482 | (rooting-buffer *rooting-buffer*)
|
|---|
| 483 | flag-mask)
|
|---|
| 484 | (declare (simple-string rooting-buffer)
|
|---|
| 485 | (fixnum char-idx))
|
|---|
| 486 | (case char
|
|---|
| 487 | (#\S (go S)) ;This covers over half of the possible endings
|
|---|
| 488 | ;by branching off the second to last character
|
|---|
| 489 | ;to other flag states that have plural endings.
|
|---|
| 490 | (#\R (setf flag-mask R-mask) ;"er" and "ier"
|
|---|
| 491 | (go D-R-Z-FLAG))
|
|---|
| 492 | (#\T (go T-FLAG)) ;"est" and "iest"
|
|---|
| 493 | (#\D (setf flag-mask D-mask) ;"ed" and "ied"
|
|---|
| 494 | (go D-R-Z-FLAG))
|
|---|
| 495 | (#\H (go H-FLAG)) ;"th" and "ieth"
|
|---|
| 496 | (#\N (setf flag-mask N-mask) ;"ion", "ication", and "en"
|
|---|
| 497 | (go N-X-FLAG))
|
|---|
| 498 | (#\G (setf flag-mask G-mask) ;"ing"
|
|---|
| 499 | (go G-J-FLAG))
|
|---|
| 500 | (#\Y (go Y-FLAG)) ;"ly"
|
|---|
| 501 | (#\E (go V-FLAG))) ;"ive"
|
|---|
| 502 | (return nil)
|
|---|
| 503 |
|
|---|
| 504 | S
|
|---|
| 505 | (setf char-idx (1- char-idx))
|
|---|
| 506 | (setf char (schar word char-idx))
|
|---|
| 507 | (if (char= char #\Y)
|
|---|
| 508 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 509 | (try-root word (1+ char-idx) S-mask)
|
|---|
| 510 | (return nil))
|
|---|
| 511 | (if (not (set-member-p char *sxzh*))
|
|---|
| 512 | (try-root word (1+ char-idx) S-mask)))
|
|---|
| 513 | (case char
|
|---|
| 514 | (#\E (go S-FLAG)) ;"es" and "ies"
|
|---|
| 515 | (#\R (setf flag-mask Z-mask) ;"ers" and "iers"
|
|---|
| 516 | (go D-R-Z-FLAG))
|
|---|
| 517 | (#\G (setf flag-mask J-mask) ;"ings"
|
|---|
| 518 | (go G-J-FLAG))
|
|---|
| 519 | (#\S (go P-FLAG)) ;"ness" and "iness"
|
|---|
| 520 | (#\N (setf flag-mask X-mask) ;"ions", "ications", and "ens"
|
|---|
| 521 | (go N-X-FLAG))
|
|---|
| 522 | (#\' (try-root word char-idx M-mask)))
|
|---|
| 523 | (return nil)
|
|---|
| 524 |
|
|---|
| 525 | S-FLAG
|
|---|
| 526 | (setf char-idx (1- char-idx))
|
|---|
| 527 | (setf char (schar word char-idx))
|
|---|
| 528 | (if (set-member-p char *sxzh*)
|
|---|
| 529 | (try-root word (1+ char-idx) S-mask))
|
|---|
| 530 | (if (and (char= char #\I)
|
|---|
| 531 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 532 | (try-modified-root word rooting-buffer (1+ char-idx)
|
|---|
| 533 | S-mask char-idx #\Y))
|
|---|
| 534 | (return nil)
|
|---|
| 535 |
|
|---|
| 536 | D-R-Z-FLAG
|
|---|
| 537 | (if (char/= (schar word (1- char-idx)) #\E) (return nil))
|
|---|
| 538 | (try-root word char-idx flag-mask)
|
|---|
| 539 | (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
|
|---|
| 540 | (setf char (schar word char-idx))
|
|---|
| 541 | (if (char= char #\Y)
|
|---|
| 542 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 543 | (try-root word (1+ char-idx) flag-mask)
|
|---|
| 544 | (return nil))
|
|---|
| 545 | (if (char/= (schar word char-idx) #\E)
|
|---|
| 546 | (try-root word (1+ char-idx) flag-mask)))
|
|---|
| 547 | (if (and (char= char #\I)
|
|---|
| 548 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 549 | (try-modified-root word rooting-buffer (1+ char-idx)
|
|---|
| 550 | flag-mask char-idx #\Y))
|
|---|
| 551 | (return nil)
|
|---|
| 552 |
|
|---|
| 553 | P-FLAG
|
|---|
| 554 | (if (or (char/= (schar word (1- char-idx)) #\E)
|
|---|
| 555 | (char/= (schar word (- char-idx 2)) #\N))
|
|---|
| 556 | (return nil))
|
|---|
| 557 | (if (<= (setf char-idx (- char-idx 3)) 0) (return nil))
|
|---|
| 558 | (setf char (schar word char-idx))
|
|---|
| 559 | (if (char= char #\Y)
|
|---|
| 560 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 561 | (try-root word (1+ char-idx) P-mask)
|
|---|
| 562 | (return nil)))
|
|---|
| 563 | (try-root word (1+ char-idx) P-mask)
|
|---|
| 564 | (if (and (char= char #\I)
|
|---|
| 565 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 566 | (try-modified-root word rooting-buffer (1+ char-idx)
|
|---|
| 567 | P-mask char-idx #\Y))
|
|---|
| 568 | (return nil)
|
|---|
| 569 |
|
|---|
| 570 | G-J-FLAG
|
|---|
| 571 | (if (< char-idx 3) (return nil))
|
|---|
| 572 | (setf char-idx (- char-idx 2))
|
|---|
| 573 | (setf char (schar word char-idx))
|
|---|
| 574 | (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\N))
|
|---|
| 575 | (return nil))
|
|---|
| 576 | (if (char/= (schar word (1- char-idx)) #\E)
|
|---|
| 577 | (try-root word char-idx flag-mask))
|
|---|
| 578 | (try-modified-root word rooting-buffer (1+ char-idx)
|
|---|
| 579 | flag-mask char-idx #\E)
|
|---|
| 580 | (return nil)
|
|---|
| 581 |
|
|---|
| 582 | N-X-FLAG
|
|---|
| 583 | (setf char-idx (1- char-idx))
|
|---|
| 584 | (setf char (schar word char-idx))
|
|---|
| 585 | (cond ((char= char #\E)
|
|---|
| 586 | (setf char (schar word (1- char-idx)))
|
|---|
| 587 | (if (and (char/= char #\Y) (char/= char #\E))
|
|---|
| 588 | (try-root word char-idx flag-mask))
|
|---|
| 589 | (return nil))
|
|---|
| 590 | ((char= char #\O)
|
|---|
| 591 | (if (char= (schar word (1- char-idx)) #\I)
|
|---|
| 592 | (try-modified-root word rooting-buffer char-idx
|
|---|
| 593 | flag-mask (1- char-idx) #\E)
|
|---|
| 594 | (return nil))
|
|---|
| 595 | (if (< char-idx 5) (return nil))
|
|---|
| 596 | (if (or (char/= (schar word (- char-idx 2)) #\T)
|
|---|
| 597 | (char/= (schar word (- char-idx 3)) #\A)
|
|---|
| 598 | (char/= (schar word (- char-idx 4)) #\C)
|
|---|
| 599 | (char/= (schar word (- char-idx 5)) #\I))
|
|---|
| 600 | (return nil)
|
|---|
| 601 | (setf char-idx (- char-idx 4)))
|
|---|
| 602 | (try-modified-root word rooting-buffer char-idx
|
|---|
| 603 | flag-mask (1- char-idx) #\Y))
|
|---|
| 604 | (t (return nil)))
|
|---|
| 605 |
|
|---|
| 606 | T-FLAG
|
|---|
| 607 | (if (or (char/= (schar word (1- char-idx)) #\S)
|
|---|
| 608 | (char/= (schar word (- char-idx 2)) #\E))
|
|---|
| 609 | (return nil)
|
|---|
| 610 | (setf char-idx (1- char-idx)))
|
|---|
| 611 | (try-root word char-idx T-mask)
|
|---|
| 612 | (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
|
|---|
| 613 | (setf char (schar word char-idx))
|
|---|
| 614 | (if (char= char #\Y)
|
|---|
| 615 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 616 | (try-root word (1+ char-idx) T-mask)
|
|---|
| 617 | (return nil))
|
|---|
| 618 | (if (char/= (schar word char-idx) #\E)
|
|---|
| 619 | (try-root word (1+ char-idx) T-mask)))
|
|---|
| 620 | (if (and (char= char #\I)
|
|---|
| 621 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 622 | (try-modified-root word rooting-buffer (1+ char-idx)
|
|---|
| 623 | T-mask char-idx #\Y))
|
|---|
| 624 | (return nil)
|
|---|
| 625 |
|
|---|
| 626 | H-FLAG
|
|---|
| 627 | (setf char-idx (1- char-idx))
|
|---|
| 628 | (setf char (schar word char-idx))
|
|---|
| 629 | (if (char/= char #\T) (return nil))
|
|---|
| 630 | (if (char/= (schar word (1- char-idx)) #\Y)
|
|---|
| 631 | (try-root word char-idx H-mask))
|
|---|
| 632 | (if (and (char= (schar word (1- char-idx)) #\E)
|
|---|
| 633 | (char= (schar word (- char-idx 2)) #\I))
|
|---|
| 634 | (try-modified-root word rooting-buffer (1- char-idx)
|
|---|
| 635 | H-mask (- char-idx 2) #\Y))
|
|---|
| 636 | (return nil)
|
|---|
| 637 |
|
|---|
| 638 | Y-FLAG
|
|---|
| 639 | (setf char-idx (1- char-idx))
|
|---|
| 640 | (setf char (schar word char-idx))
|
|---|
| 641 | (if (char= char #\L)
|
|---|
| 642 | (try-root word char-idx Y-mask))
|
|---|
| 643 | (return nil)
|
|---|
| 644 |
|
|---|
| 645 | V-FLAG
|
|---|
| 646 | (setf char-idx (- char-idx 2))
|
|---|
| 647 | (setf char (schar word char-idx))
|
|---|
| 648 | (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\V))
|
|---|
| 649 | (return nil))
|
|---|
| 650 | (if (char/= (schar word (1- char-idx)) #\E)
|
|---|
| 651 | (try-root word char-idx V-mask))
|
|---|
| 652 | (try-modified-root word rooting-buffer (1+ char-idx)
|
|---|
| 653 | V-mask char-idx #\E)
|
|---|
| 654 | (return nil)))
|
|---|
| 655 |
|
|---|
| 656 |
|
|---|
| 657 |
|
|---|
| 658 | ;;; DESCRIPTOR-FLAG returns t or nil based on whether the flag is on.
|
|---|
| 659 | ;;; From the diagram at the beginning of the file, we see that the flags
|
|---|
| 660 | ;;; are stored two words off of the first word in the descriptor unit for
|
|---|
| 661 | ;;; an entry.
|
|---|
| 662 | ;;;
|
|---|
| 663 | (defun descriptor-flag (descriptor-start flag-mask)
|
|---|
| 664 | (not (zerop
|
|---|
| 665 | (the fixnum
|
|---|
| 666 | (logand
|
|---|
| 667 | (the fixnum (descriptor-ref (+ 2 (the fixnum descriptor-start))))
|
|---|
| 668 | (the fixnum flag-mask))))))
|
|---|
| 669 |
|
|---|
| 670 | |
|---|
| 671 |
|
|---|
| 672 | ;;;; Looking up Trials
|
|---|
| 673 |
|
|---|
| 674 | (eval-when (:compile-toplevel :execute)
|
|---|
| 675 |
|
|---|
| 676 | ;;; SPELL-STRING= determines if string1 and string2 are the same. Before
|
|---|
| 677 | ;;; it is called it is known that they are both of (- end1 0) length, and
|
|---|
| 678 | ;;; string2 is in system space. This is used in FOUND-ENTRY-P.
|
|---|
| 679 | ;;;
|
|---|
| 680 | (defmacro spell-string= (string1 string2 end1 start2)
|
|---|
| 681 | (let ((idx1 (gensym))
|
|---|
| 682 | (idx2 (gensym)))
|
|---|
| 683 | `(do ((,idx1 0 (1+ ,idx1))
|
|---|
| 684 | (,idx2 ,start2 (1+ ,idx2)))
|
|---|
| 685 | ((= ,idx1 ,end1) t)
|
|---|
| 686 | (declare (fixnum ,idx1 ,idx2))
|
|---|
| 687 | (unless (= (the fixnum (char-code (schar ,string1 ,idx1)))
|
|---|
| 688 | (the fixnum (string-sapref ,string2 ,idx2)))
|
|---|
| 689 | (return nil)))))
|
|---|
| 690 |
|
|---|
| 691 | ;;; FOUND-ENTRY-P determines if entry is what is described at idx.
|
|---|
| 692 | ;;; Hash-and-length is 16 bits that look just like the first word of any
|
|---|
| 693 | ;;; entry's descriptor unit (see diagram at the beginning of the file). If
|
|---|
| 694 | ;;; the word stored at idx and entry have the same hash bits and length,
|
|---|
| 695 | ;;; then we compare characters to see if they are the same.
|
|---|
| 696 | ;;;
|
|---|
| 697 | (defmacro found-entry-p (idx entry entry-len hash-and-length)
|
|---|
| 698 | `(if (= (the fixnum (descriptor-ref ,idx))
|
|---|
| 699 | (the fixnum ,hash-and-length))
|
|---|
| 700 | (spell-string= ,entry *string-table* ,entry-len
|
|---|
| 701 | (descriptor-string-start ,idx))))
|
|---|
| 702 |
|
|---|
| 703 | (defmacro hash2-increment (hash)
|
|---|
| 704 | `(- (the fixnum *dictionary-size*)
|
|---|
| 705 | 2
|
|---|
| 706 | (the fixnum (rem ,hash (- (the fixnum *dictionary-size*) 2)))))
|
|---|
| 707 |
|
|---|
| 708 | (defmacro hash2-loop ((location-var contents-var)
|
|---|
| 709 | loc hash zero-contents-form
|
|---|
| 710 | &optional body-form (for-insertion-p nil))
|
|---|
| 711 | (let ((incr (gensym)))
|
|---|
| 712 | `(let* ((,incr (hash2-increment ,hash))
|
|---|
| 713 | (,location-var ,loc)
|
|---|
| 714 | (,contents-var 0))
|
|---|
| 715 | (declare (fixnum ,location-var ,contents-var ,incr))
|
|---|
| 716 | (loop (setf ,location-var
|
|---|
| 717 | (rem (+ ,location-var ,incr) (the fixnum *dictionary-size*)))
|
|---|
| 718 | (setf ,contents-var (dictionary-ref ,location-var))
|
|---|
| 719 | (if (zerop ,contents-var) (return ,zero-contents-form))
|
|---|
| 720 | ,@(if for-insertion-p
|
|---|
| 721 | `((if (= ,contents-var spell-deleted-entry)
|
|---|
| 722 | (return ,zero-contents-form))))
|
|---|
| 723 | (if (= ,location-var ,loc) (return nil))
|
|---|
| 724 | ,@(if body-form `(,body-form))))))
|
|---|
| 725 |
|
|---|
| 726 | ) ;eval-when
|
|---|
| 727 |
|
|---|
| 728 |
|
|---|
| 729 | ;;; LOOKUP-ENTRY returns the index of the first element of entry's
|
|---|
| 730 | ;;; descriptor unit on success, otherwise nil.
|
|---|
| 731 | ;;;
|
|---|
| 732 | (defun lookup-entry (entry &optional len)
|
|---|
| 733 | (declare (simple-string entry))
|
|---|
| 734 | (let* ((entry-len (or len (length entry)))
|
|---|
| 735 | (hash (string-hash entry entry-len))
|
|---|
| 736 | (hash-and-len (dpb (the fixnum (ldb new-hash-byte hash))
|
|---|
| 737 | stored-hash-byte
|
|---|
| 738 | (the fixnum entry-len)))
|
|---|
| 739 | (loc (rem hash (the fixnum *dictionary-size*)))
|
|---|
| 740 | (loc-contents (dictionary-ref loc)))
|
|---|
| 741 | (declare (fixnum entry-len hash hash-and-len loc))
|
|---|
| 742 | (cond ((zerop loc-contents) nil)
|
|---|
| 743 | ((found-entry-p loc-contents entry entry-len hash-and-len)
|
|---|
| 744 | loc-contents)
|
|---|
| 745 | (t
|
|---|
| 746 | (hash2-loop (loop-loc loc-contents) loc hash
|
|---|
| 747 | nil
|
|---|
| 748 | (if (found-entry-p loc-contents entry entry-len hash-and-len)
|
|---|
| 749 | (return loc-contents)))))))
|
|---|
| 750 | |
|---|
| 751 |
|
|---|
| 752 | ;;;; Binary File Reading
|
|---|
| 753 |
|
|---|
| 754 | (defparameter default-binary-dictionary
|
|---|
| 755 | "library:spell-dictionary.bin")
|
|---|
| 756 |
|
|---|
| 757 | ;;; This is the first thing in a spell binary dictionary file to serve as a
|
|---|
| 758 | ;;; quick check of its proposed contents. This particular number is
|
|---|
| 759 | ;;; "BILLS" on a calculator held upside-down.
|
|---|
| 760 | ;;;
|
|---|
| 761 | (defconstant magic-file-id 57718)
|
|---|
| 762 |
|
|---|
| 763 | ;;; These constants are derived from the order things are written to the
|
|---|
| 764 | ;;; binary dictionary in Spell-Build.Lisp.
|
|---|
| 765 | ;;;
|
|---|
| 766 | (defconstant magic-file-id-loc 0)
|
|---|
| 767 | (defconstant dictionary-size-loc 1)
|
|---|
| 768 | (defconstant descriptors-size-loc 2)
|
|---|
| 769 | (defconstant string-table-size-low-byte-loc 3)
|
|---|
| 770 | (defconstant string-table-size-high-byte-loc 4)
|
|---|
| 771 | (defconstant file-header-bytes 10)
|
|---|
| 772 |
|
|---|
| 773 | ;;; Initially, there are no free descriptor elements and string table bytes,
|
|---|
| 774 | ;;; but when these structures are grown, they are grown by more than that
|
|---|
| 775 | ;;; which is necessary.
|
|---|
| 776 | ;;;
|
|---|
| 777 | (defvar *free-descriptor-elements* 0)
|
|---|
| 778 | (defvar *free-string-table-bytes* 0)
|
|---|
| 779 |
|
|---|
| 780 | ;;; READ-DICTIONARY opens the dictionary and sets up the global structures
|
|---|
| 781 | ;;; manifesting the spelling dictionary. When computing the start addresses
|
|---|
| 782 | ;;; of these structures, we multiply by two since their sizes are in 16bit
|
|---|
| 783 | ;;; lengths while the RT is 8bit-byte addressable.
|
|---|
| 784 | ;;;
|
|---|
| 785 | (defun read-dictionary (&optional (f default-binary-dictionary))
|
|---|
| 786 | (when *dictionary-read-p*
|
|---|
| 787 | (setf *dictionary-read-p* nil)
|
|---|
| 788 | (deallocate-bytes (system-address *dictionary*)
|
|---|
| 789 | (* 2 (the fixnum *dictionary-size*)))
|
|---|
| 790 | (deallocate-bytes (system-address *descriptors*)
|
|---|
| 791 | (* 2 (the fixnum
|
|---|
| 792 | (+ (the fixnum *descriptors-size*)
|
|---|
| 793 | (the fixnum *free-descriptor-elements*)))))
|
|---|
| 794 | (deallocate-bytes (system-address *string-table*)
|
|---|
| 795 | (+ (the fixnum *string-table-size*)
|
|---|
| 796 | (the fixnum *free-string-table-bytes*))))
|
|---|
| 797 | (setf *free-descriptor-elements* 0)
|
|---|
| 798 | (setf *free-string-table-bytes* 0)
|
|---|
| 799 | (let* ((fd (open-dictionary f))
|
|---|
| 800 | (header-info (read-dictionary-structure fd file-header-bytes)))
|
|---|
| 801 | (unless (= (sapref header-info magic-file-id-loc) magic-file-id)
|
|---|
| 802 | (deallocate-bytes (system-address header-info) file-header-bytes)
|
|---|
| 803 | (error "File is not a dictionary: ~S." f))
|
|---|
| 804 | (setf *dictionary-size* (sapref header-info dictionary-size-loc))
|
|---|
| 805 | (setf *descriptors-size* (sapref header-info descriptors-size-loc))
|
|---|
| 806 | (setf *string-table-size* (sapref header-info string-table-size-low-byte-loc))
|
|---|
| 807 | (setf (ldb (byte 12 16) (the fixnum *string-table-size*))
|
|---|
| 808 | (the fixnum (sapref header-info string-table-size-high-byte-loc)))
|
|---|
| 809 | (deallocate-bytes (system-address header-info) file-header-bytes)
|
|---|
| 810 | (setf *dictionary*
|
|---|
| 811 | (read-dictionary-structure fd (* 2 (the fixnum *dictionary-size*))))
|
|---|
| 812 | (setf *descriptors*
|
|---|
| 813 | (read-dictionary-structure fd (* 2 (the fixnum *descriptors-size*))))
|
|---|
| 814 | (setf *string-table* (read-dictionary-structure fd *string-table-size*))
|
|---|
| 815 | (setf *dictionary-read-p* t)
|
|---|
| 816 | (close-dictionary fd)))
|
|---|