| 1 | ;;; -*- Log: hemlock.log; Package: Spell -*-
|
|---|
| 2 | ;;;
|
|---|
| 3 | ;;; **********************************************************************
|
|---|
| 4 | ;;; This code was written as part of the CMU Common Lisp project at
|
|---|
| 5 | ;;; Carnegie Mellon University, and has been placed in the public domain.
|
|---|
| 6 | ;;;
|
|---|
| 7 | ;;; **********************************************************************
|
|---|
| 8 | ;;;
|
|---|
| 9 | ;;; Written by Bill Chiles
|
|---|
| 10 | ;;; Designed by Bill Chiles and Rob Maclachlan
|
|---|
| 11 | ;;;
|
|---|
| 12 |
|
|---|
| 13 | ;;; This is the file that deals with checking and correcting words
|
|---|
| 14 | ;;; using a dictionary read in from a binary file. It has been written
|
|---|
| 15 | ;;; from the basic ideas used in Ispell (on DEC-20's) which originated as
|
|---|
| 16 | ;;; Spell on the ITS machines at MIT. There are flags which have proper
|
|---|
| 17 | ;;; uses defined for them that indicate permissible suffixes to entries.
|
|---|
| 18 | ;;; This allows for about three times as many known words than are actually
|
|---|
| 19 | ;;; stored. When checking the spelling of a word, first it is looked up;
|
|---|
| 20 | ;;; if this fails, then possible roots are looked up, and if any has the
|
|---|
| 21 | ;;; appropriate suffix flag, then the word is considered to be correctly
|
|---|
| 22 | ;;; spelled. For an unknown word, the following rules define "close" words
|
|---|
| 23 | ;;; which are possible corrections:
|
|---|
| 24 | ;;; 1] two adjacent letters are transposed to form a correct spelling;
|
|---|
| 25 | ;;; 2] one letter is changed to form a correct spelling;
|
|---|
| 26 | ;;; 3] one letter is added to form a correct spelling; and/or
|
|---|
| 27 | ;;; 4] one letter is removed to form a correct spelling.
|
|---|
| 28 | ;;; There are two restrictions on the length of a word in regards to its
|
|---|
| 29 | ;;; worthiness of recognition: it must be at least more than two letters
|
|---|
| 30 | ;;; long, and if it has a suffix, then it must be at least four letters
|
|---|
| 31 | ;;; long. More will be said about this when the flags are discussed.
|
|---|
| 32 | ;;; This is implemented in as tense a fashion as possible, and it uses
|
|---|
| 33 | ;;; implementation dependent code from Spell-RT.Lisp to accomplish this.
|
|---|
| 34 | ;;; In general the file I/O and structure accesses encompass the system
|
|---|
| 35 | ;;; dependencies.
|
|---|
| 36 |
|
|---|
| 37 | ;;; This next section will discuss the storage of the dictionary
|
|---|
| 38 | ;;; information. There are three data structures that "are" the
|
|---|
| 39 | ;;; dictionary: a hash table, descriptors table, and a string table. The
|
|---|
| 40 | ;;; hash table is a vector of type '(unsigned-byte 16), whose elements
|
|---|
| 41 | ;;; point into the descriptors table. This is a cyclic hash table to
|
|---|
| 42 | ;;; facilitate dumping it to a file. The descriptors table (also of type
|
|---|
| 43 | ;;; '(unsigned-byte 16)) dedicates three elements to each entry in the
|
|---|
| 44 | ;;; dictionary. Each group of three elements has the following organization
|
|---|
| 45 | ;;; imposed on them:
|
|---|
| 46 | ;;; ----------------------------------------------
|
|---|
| 47 | ;;; | 15..5 hash code | 4..0 length |
|
|---|
| 48 | ;;; ----------------------------------------------
|
|---|
| 49 | ;;; | 15..0 character index |
|
|---|
| 50 | ;;; ----------------------------------------------
|
|---|
| 51 | ;;; | 15..14 character index | 13..0 flags |
|
|---|
| 52 | ;;; ----------------------------------------------
|
|---|
| 53 | ;;; "Length" is the number of characters in the entry; "hash code" is some
|
|---|
| 54 | ;;; eleven bits from the hash code to allow for quicker lookup, "flags"
|
|---|
| 55 | ;;; indicate possible suffixes for the basic entry, and "character index"
|
|---|
| 56 | ;;; is the index of the start of the entry in the string table.
|
|---|
| 57 | ;;; This was originally adopted due to the Perq's word size (can you guess?
|
|---|
| 58 | ;;; 16 bits, that's right). Note the constraint that is placed on the number
|
|---|
| 59 | ;;; of the entries, 21845, because the hash table could not point to more
|
|---|
| 60 | ;;; descriptor units (16 bits of pointer divided by three). Since a value of
|
|---|
| 61 | ;;; zero as a hash table element indicates an empty location, the zeroth element
|
|---|
| 62 | ;;; of the descriptors table must be unused (it cannot be pointed to).
|
|---|
| 63 |
|
|---|
| 64 |
|
|---|
| 65 | ;;; The following is a short discussion with examples of the correct
|
|---|
| 66 | ;;; use of the suffix flags. Let # and @ be symbols that can stand for any
|
|---|
| 67 | ;;; single letter. Upper case letters are constants. "..." stands for any
|
|---|
| 68 | ;;; string of zero or more letters, but note that no word may exist in the
|
|---|
| 69 | ;;; dictionary which is not at least 2 letters long, so, for example, FLY
|
|---|
| 70 | ;;; may not be produced by placing the "Y" flag on "F". Also, no flag is
|
|---|
| 71 | ;;; effective unless the word that it creates is at least 4 letters long,
|
|---|
| 72 | ;;; so, for example, WED may not be produced by placing the "D" flag on
|
|---|
| 73 | ;;; "WE". These flags and examples are from the Ispell documentation with
|
|---|
| 74 | ;;; only slight modifications. Here are the correct uses of the flags:
|
|---|
| 75 | ;;;
|
|---|
| 76 | ;;; "V" flag:
|
|---|
| 77 | ;;; ...E => ...IVE as in create => creative
|
|---|
| 78 | ;;; if # .ne. E, then ...# => ...#IVE as in prevent => preventive
|
|---|
| 79 | ;;;
|
|---|
| 80 | ;;; "N" flag:
|
|---|
| 81 | ;;; ...E => ...ION as in create => creation
|
|---|
| 82 | ;;; ...Y => ...ICATION as in multiply => multiplication
|
|---|
| 83 | ;;; if # .ne. E or Y, then ...# => ...#EN as in fall => fallen
|
|---|
| 84 | ;;;
|
|---|
| 85 | ;;; "X" flag:
|
|---|
| 86 | ;;; ...E => ...IONS as in create => creations
|
|---|
| 87 | ;;; ...Y => ...ICATIONS as in multiply => multiplications
|
|---|
| 88 | ;;; if # .ne. E or Y, ...# => ...#ENS as in weak => weakens
|
|---|
| 89 | ;;;
|
|---|
| 90 | ;;; "H" flag:
|
|---|
| 91 | ;;; ...Y => ...IETH as in twenty => twentieth
|
|---|
| 92 | ;;; if # .ne. Y, then ...# => ...#TH as in hundred => hundredth
|
|---|
| 93 | ;;;
|
|---|
| 94 | ;;; "Y" FLAG:
|
|---|
| 95 | ;;; ... => ...LY as in quick => quickly
|
|---|
| 96 | ;;;
|
|---|
| 97 | ;;; "G" FLAG:
|
|---|
| 98 | ;;; ...E => ...ING as in file => filing
|
|---|
| 99 | ;;; if # .ne. E, then ...# => ...#ING as in cross => crossing
|
|---|
| 100 | ;;;
|
|---|
| 101 | ;;; "J" FLAG"
|
|---|
| 102 | ;;; ...E => ...INGS as in file => filings
|
|---|
| 103 | ;;; if # .ne. E, then ...# => ...#INGS as in cross => crossings
|
|---|
| 104 | ;;;
|
|---|
| 105 | ;;; "D" FLAG:
|
|---|
| 106 | ;;; ...E => ...ED as in create => created
|
|---|
| 107 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 108 | ;;; then ...@Y => ...@IED as in imply => implied
|
|---|
| 109 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 110 | ;;; then ...@# => ...@#ED as in convey => conveyed
|
|---|
| 111 | ;;; if # .ne. E or Y, then ...# => ...#ED as in cross => crossed
|
|---|
| 112 | ;;;
|
|---|
| 113 | ;;; "T" FLAG:
|
|---|
| 114 | ;;; ...E => ...EST as in late => latest
|
|---|
| 115 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 116 | ;;; then ...@Y => ...@IEST as in dirty => dirtiest
|
|---|
| 117 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 118 | ;;; then ...@# => ...@#EST as in gray => grayest
|
|---|
| 119 | ;;; if # .ne. E or Y, then ...# => ...#EST as in small => smallest
|
|---|
| 120 | ;;;
|
|---|
| 121 | ;;; "R" FLAG:
|
|---|
| 122 | ;;; ...E => ...ER as in skate => skater
|
|---|
| 123 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 124 | ;;; then ...@Y => ...@IER as in multiply => multiplier
|
|---|
| 125 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 126 | ;;; then ...@# => ...@#ER as in convey => conveyer
|
|---|
| 127 | ;;; if # .ne. E or Y, then ...# => ...#ER as in build => builder
|
|---|
| 128 | ;;;
|
|---|
| 129 |
|
|---|
| 130 | ;;; "Z FLAG:
|
|---|
| 131 | ;;; ...E => ...ERS as in skate => skaters
|
|---|
| 132 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 133 | ;;; then ...@Y => ...@IERS as in multiply => multipliers
|
|---|
| 134 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 135 | ;;; then ...@# => ...@#ERS as in slay => slayers
|
|---|
| 136 | ;;; if # .ne. E or Y, then ...@# => ...@#ERS as in build => builders
|
|---|
| 137 | ;;;
|
|---|
| 138 | ;;; "S" FLAG:
|
|---|
| 139 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 140 | ;;; then ...@Y => ...@IES as in imply => implies
|
|---|
| 141 | ;;; if # .eq. S, X, Z, or H,
|
|---|
| 142 | ;;; then ...# => ...#ES as in fix => fixes
|
|---|
| 143 | ;;; if # .ne. S, X, Z, H, or Y,
|
|---|
| 144 | ;;; then ...# => ...#S as in bat => bats
|
|---|
| 145 | ;;; if # = Y, and @ = A, E, I, O, or U,
|
|---|
| 146 | ;;; then ...@# => ...@#S as in convey => conveys
|
|---|
| 147 | ;;;
|
|---|
| 148 | ;;; "P" FLAG:
|
|---|
| 149 | ;;; if # .ne. Y, or @ = A, E, I, O, or U,
|
|---|
| 150 | ;;; then ...@# => ...@#NESS as in late => lateness and
|
|---|
| 151 | ;;; gray => grayness
|
|---|
| 152 | ;;; if @ .ne. A, E, I, O, or U,
|
|---|
| 153 | ;;; then ...@Y => ...@INESS as in cloudy => cloudiness
|
|---|
| 154 | ;;;
|
|---|
| 155 | ;;; "M" FLAG:
|
|---|
| 156 | ;;; ... => ...'S as in DOG => DOG'S
|
|---|
| 157 |
|
|---|
| 158 | (in-package "SPELL")
|
|---|
| 159 |
|
|---|
| 160 | |
|---|
| 161 |
|
|---|
| 162 | ;;;; Some Specials and Accesses
|
|---|
| 163 |
|
|---|
| 164 | ;;; *spell-aeiou* will have bits on that represent the capital letters
|
|---|
| 165 | ;;; A, E, I, O, and U to be used to determine if some word roots are legal
|
|---|
| 166 | ;;; for looking up.
|
|---|
| 167 | ;;;
|
|---|
| 168 | (defvar *aeiou*
|
|---|
| 169 | (make-array 128 :element-type 'bit :initial-element 0))
|
|---|
| 170 |
|
|---|
| 171 | (setf (aref *aeiou* (char-code #\A)) 1)
|
|---|
| 172 | (setf (aref *aeiou* (char-code #\E)) 1)
|
|---|
| 173 | (setf (aref *aeiou* (char-code #\I)) 1)
|
|---|
| 174 | (setf (aref *aeiou* (char-code #\O)) 1)
|
|---|
| 175 | (setf (aref *aeiou* (char-code #\U)) 1)
|
|---|
| 176 |
|
|---|
| 177 |
|
|---|
| 178 | ;;; *sxzh* will have bits on that represent the capital letters
|
|---|
| 179 | ;;; S, X, Z, and H to be used to determine if some word roots are legal for
|
|---|
| 180 | ;;; looking up.
|
|---|
| 181 | ;;;
|
|---|
| 182 | (defvar *sxzh*
|
|---|
| 183 | (make-array 128 :element-type 'bit :initial-element 0))
|
|---|
| 184 |
|
|---|
| 185 | (setf (aref *sxzh* (char-code #\S)) 1)
|
|---|
| 186 | (setf (aref *sxzh* (char-code #\X)) 1)
|
|---|
| 187 | (setf (aref *sxzh* (char-code #\Z)) 1)
|
|---|
| 188 | (setf (aref *sxzh* (char-code #\H)) 1)
|
|---|
| 189 |
|
|---|
| 190 |
|
|---|
| 191 | ;;; SET-MEMBER-P will be used with *aeiou* and *sxzh* to determine if a
|
|---|
| 192 | ;;; character is in the specified set.
|
|---|
| 193 | ;;;
|
|---|
| 194 | (declaim (inline set-member-p))
|
|---|
| 195 | (defun set-member-p (char set)
|
|---|
| 196 | (not (zerop (the fixnum (aref (the simple-bit-vector set)
|
|---|
| 197 | (char-code char))))))
|
|---|
| 198 |
|
|---|
| 199 | ;;; DESC-TABLE-REF and DESCRIPTOR-REF are references to implementation
|
|---|
| 200 | ;;; dependent structures.
|
|---|
| 201 | ;;;
|
|---|
| 202 | (declaim (inline desc-table-ref descriptor-ref))
|
|---|
| 203 | (defun desc-table-ref (dictionary index)
|
|---|
| 204 | (aref (descriptor-table dictionary) index))
|
|---|
| 205 | (defun %set-desc-table-ref (dictionary index value)
|
|---|
| 206 | (setf (aref (descriptor-table dictionary) index) value))
|
|---|
| 207 |
|
|---|
| 208 | (defsetf desc-table-ref %set-desc-table-ref)
|
|---|
| 209 |
|
|---|
| 210 | (defun descriptor-ref (dictionary index)
|
|---|
| 211 | (aref (descriptors dictionary) index))
|
|---|
| 212 |
|
|---|
| 213 |
|
|---|
| 214 | ;;; DESCRIPTOR-STRING-START access an entry's (indicated by idx)
|
|---|
| 215 | ;;; descriptor unit (described at the beginning of the file) and returns
|
|---|
| 216 | ;;; the start index of the entry in the string table. The second of three
|
|---|
| 217 | ;;; words in the descriptor holds the 16 least significant bits of 18, and
|
|---|
| 218 | ;;; the top two bits of the third word are the 2 most significant bits.
|
|---|
| 219 | ;;; These 18 bits are the index into the string table.
|
|---|
| 220 | ;;;
|
|---|
| 221 | (defun descriptor-string-start (dictionary index)
|
|---|
| 222 | (desc-string-index (descriptor-ref dictionary index)))
|
|---|
| 223 |
|
|---|
| 224 | |
|---|
| 225 |
|
|---|
| 226 | ;;;; Top level Checking/Correcting
|
|---|
| 227 |
|
|---|
| 228 | ;;; CORRECT-SPELLING can be called from top level to check/correct a words
|
|---|
| 229 | ;;; spelling. It is not used for any other purpose.
|
|---|
| 230 | ;;;
|
|---|
| 231 | (defun correct-spelling (dictionary word)
|
|---|
| 232 | "Check/correct the spelling of word. Output is done to *standard-output*."
|
|---|
| 233 | (setf word (coerce word 'simple-string))
|
|---|
| 234 | (let ((word (string-upcase (the simple-string word)))
|
|---|
| 235 | (word-len (length (the simple-string word))))
|
|---|
| 236 | (declare (simple-string word) (fixnum word-len))
|
|---|
| 237 | (when (= word-len 1)
|
|---|
| 238 | (error "Single character words are not in the dictionary."))
|
|---|
| 239 | (when (> word-len +max-entry-length+)
|
|---|
| 240 | (error "~A is too long for the dictionary." word))
|
|---|
| 241 | (multiple-value-bind (idx used-flag-p)
|
|---|
| 242 | (spell-try-word dictionary word word-len)
|
|---|
| 243 | (if idx
|
|---|
| 244 | (format t "Found it~:[~; because of ~A~]." used-flag-p
|
|---|
| 245 | (spell-root-word dictionary idx))
|
|---|
| 246 | (let ((close-words (spell-collect-close-words dictionary word)))
|
|---|
| 247 | (if close-words
|
|---|
| 248 | (format *standard-output*
|
|---|
| 249 | "The possible correct spelling~[~; is~:;s are~]:~
|
|---|
| 250 | ~:*~[~; ~{~A~}~;~{ ~A~^ and~}~:;~
|
|---|
| 251 | ~{~#[~; and~] ~A~^,~}~]."
|
|---|
| 252 | (length close-words)
|
|---|
| 253 | close-words)
|
|---|
| 254 | (format *standard-output* "Word not found.")))))))
|
|---|
| 255 |
|
|---|
| 256 |
|
|---|
| 257 | (defun spell-root-word (dictionary index)
|
|---|
| 258 | "Return the root word corresponding to a dictionary entry at index."
|
|---|
| 259 | (let* ((descriptor (descriptor-ref dictionary index))
|
|---|
| 260 | (start (desc-string-index descriptor))
|
|---|
| 261 | (len (desc-length descriptor)))
|
|---|
| 262 | (declare (fixnum start len))
|
|---|
| 263 | ;; return a copy
|
|---|
| 264 | (subseq (string-table dictionary) start (+ start len))))
|
|---|
| 265 |
|
|---|
| 266 |
|
|---|
| 267 | ;;; SPELL-COLLECT-CLOSE-WORDS Returns a list of all "close" correctly spelled
|
|---|
| 268 | ;;; words. The definition of "close" is at the beginning of the file, and
|
|---|
| 269 | ;;; there are four sections to this function which collect each of the four
|
|---|
| 270 | ;;; different kinds of close words.
|
|---|
| 271 | ;;;
|
|---|
| 272 | (defun spell-collect-close-words (dictionary word)
|
|---|
| 273 | "Returns a list of all \"close\" correctly spelled words. This has the
|
|---|
| 274 | same contraints as SPELL-TRY-WORD, which you have probably already called
|
|---|
| 275 | if you are calling this."
|
|---|
| 276 | (declare (simple-string word))
|
|---|
| 277 | (let* ((word-len (length word))
|
|---|
| 278 | (word-len--1 (1- word-len))
|
|---|
| 279 | (word-len-+1 (1+ word-len))
|
|---|
| 280 | (result ())
|
|---|
| 281 | (correcting-buffer (make-string +max-entry-length+)))
|
|---|
| 282 | (macrolet ((check-closeness (dictionary word word-len closeness-list)
|
|---|
| 283 | `(when (spell-try-word ,dictionary ,word ,word-len)
|
|---|
| 284 | (pushnew (subseq ,word 0 ,word-len)
|
|---|
| 285 | ,closeness-list :test #'string=))))
|
|---|
| 286 | (declare (simple-string correcting-buffer)
|
|---|
| 287 | (fixnum word-len word-len--1 word-len-+1))
|
|---|
| 288 | (replace correcting-buffer word :end1 word-len :end2 word-len)
|
|---|
| 289 |
|
|---|
| 290 | ;; Misspelled because one letter is different.
|
|---|
| 291 | (dotimes (i word-len)
|
|---|
| 292 | (do ((save-char (schar correcting-buffer i))
|
|---|
| 293 | (alphabet +spell-alphabet+ (cdr alphabet)))
|
|---|
| 294 | ((null alphabet)
|
|---|
| 295 | (setf (schar correcting-buffer i) save-char))
|
|---|
| 296 | (setf (schar correcting-buffer i) (car alphabet))
|
|---|
| 297 | (check-closeness dictionary correcting-buffer word-len result)))
|
|---|
| 298 |
|
|---|
| 299 | ;; Misspelled because two adjacent letters are transposed.
|
|---|
| 300 | (dotimes (i word-len--1)
|
|---|
| 301 | (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i)))
|
|---|
| 302 | (check-closeness dictionary correcting-buffer word-len result)
|
|---|
| 303 | (rotatef (schar correcting-buffer i) (schar correcting-buffer (1+ i))))
|
|---|
| 304 |
|
|---|
| 305 | ;; Misspelled because of extraneous letter.
|
|---|
| 306 | (replace correcting-buffer word
|
|---|
| 307 | :start2 1 :end1 word-len--1 :end2 word-len)
|
|---|
| 308 | (check-closeness dictionary correcting-buffer word-len--1 result)
|
|---|
| 309 | (dotimes (i word-len--1)
|
|---|
| 310 | (setf (schar correcting-buffer i) (schar word i))
|
|---|
| 311 | (replace correcting-buffer word
|
|---|
| 312 | :start1 (1+ i) :start2 (+ i 2) :end1 word-len--1 :end2 word-len)
|
|---|
| 313 | (check-closeness dictionary correcting-buffer word-len--1 result))
|
|---|
| 314 |
|
|---|
| 315 | ;; Misspelled because a letter is missing.
|
|---|
| 316 | (replace correcting-buffer word
|
|---|
| 317 | :start1 1 :end1 word-len-+1 :end2 word-len)
|
|---|
| 318 | (dotimes (i word-len-+1)
|
|---|
| 319 | (do ((alphabet +spell-alphabet+ (cdr alphabet)))
|
|---|
| 320 | ((null alphabet)
|
|---|
| 321 | (rotatef (schar correcting-buffer i)
|
|---|
| 322 | (schar correcting-buffer (1+ i))))
|
|---|
| 323 | (setf (schar correcting-buffer i) (car alphabet))
|
|---|
| 324 | (check-closeness dictionary correcting-buffer word-len-+1 result)))
|
|---|
| 325 | result)))
|
|---|
| 326 |
|
|---|
| 327 | ;;; SPELL-TRY-WORD The literal 4 is not a constant defined somewhere since it
|
|---|
| 328 | ;;; is part of the definition of the function of looking up words.
|
|---|
| 329 | ;;; TRY-WORD-ENDINGS relies on the guarantee that word-len is at least 4.
|
|---|
| 330 | ;;;
|
|---|
| 331 | (defun spell-try-word (dictionary word word-len)
|
|---|
| 332 | "See if the word or an appropriate root is in the spelling dicitionary.
|
|---|
| 333 | Word-len must be inclusively in the range 2..max-entry-length."
|
|---|
| 334 | (or (lookup-entry dictionary word word-len)
|
|---|
| 335 | (if (>= (the fixnum word-len) +minimum-try-word-endings-length+)
|
|---|
| 336 | (try-word-endings dictionary word word-len))))
|
|---|
| 337 |
|
|---|
| 338 |
|
|---|
| 339 | |
|---|
| 340 |
|
|---|
| 341 | ;;;; Divining Correct Spelling
|
|---|
| 342 |
|
|---|
| 343 | (eval-when (:compile-toplevel :execute)
|
|---|
| 344 |
|
|---|
| 345 | (defmacro setup-root-buffer (word buffer root-len)
|
|---|
| 346 | `(replace ,buffer ,word :end1 ,root-len :end2 ,root-len))
|
|---|
| 347 |
|
|---|
| 348 | (defmacro try-root (dictionary word root-len flag-mask)
|
|---|
| 349 | (let ((result (gensym)))
|
|---|
| 350 | `(let ((,result (lookup-entry ,dictionary ,word ,root-len)))
|
|---|
| 351 | (if (and ,result (descriptor-flag ,dictionary ,result ,flag-mask))
|
|---|
| 352 | (return (values ,result ,flag-mask))))))
|
|---|
| 353 |
|
|---|
| 354 | ;;; TRY-MODIFIED-ROOT is used for root words that become truncated
|
|---|
| 355 | ;;; when suffixes are added (e.g., skate => skating). Char-idx is the last
|
|---|
| 356 | ;;; character in the root that has to typically be changed from a #\I to a
|
|---|
| 357 | ;;; #\Y or #\E.
|
|---|
| 358 | ;;;
|
|---|
| 359 | (defmacro try-modified-root (dictionary word buffer
|
|---|
| 360 | root-len flag-mask char-idx new-char)
|
|---|
| 361 | (let ((root-word (gensym)))
|
|---|
| 362 | `(let ((,root-word (setup-root-buffer ,word ,buffer ,root-len)))
|
|---|
| 363 | (setf (schar ,root-word ,char-idx) ,new-char)
|
|---|
| 364 | (try-root ,dictionary ,root-word ,root-len ,flag-mask))))
|
|---|
| 365 |
|
|---|
| 366 | ) ;eval-when
|
|---|
| 367 |
|
|---|
| 368 | (defvar *rooting-buffer* (make-string +max-entry-length+))
|
|---|
| 369 |
|
|---|
| 370 | ;;; TRY-WORD-ENDINGS takes a word that is at least of length 4 and
|
|---|
| 371 | ;;; returns multiple values on success (the index where the word's root's
|
|---|
| 372 | ;;; descriptor starts and :used-flag), otherwise nil. It looks at
|
|---|
| 373 | ;;; characters from the end to the beginning of the word to determine if it
|
|---|
| 374 | ;;; has any known suffixes. This is a VERY simple finite state machine
|
|---|
| 375 | ;;; where all of the suffixes are narrowed down to one possible one in at
|
|---|
| 376 | ;;; most two state changes. This is a PROG form for speed, and in some sense,
|
|---|
| 377 | ;;; readability. The states of the machine are the flag names that denote
|
|---|
| 378 | ;;; suffixes. The two points of branching to labels are the very beginning
|
|---|
| 379 | ;;; of the PROG and the S state. This is a fairly straight forward
|
|---|
| 380 | ;;; implementation of the flag rules presented at the beginning of this
|
|---|
| 381 | ;;; file, with char-idx checks, so we do not index the string below zero.
|
|---|
| 382 |
|
|---|
| 383 | (defun try-word-endings (dictionary word word-len)
|
|---|
| 384 | (declare (simple-string word)
|
|---|
| 385 | (fixnum word-len))
|
|---|
| 386 | (prog* ((char-idx (1- word-len))
|
|---|
| 387 | (char (schar word char-idx))
|
|---|
| 388 | (rooting-buffer *rooting-buffer*)
|
|---|
| 389 | flag-mask)
|
|---|
| 390 | (declare (simple-string rooting-buffer)
|
|---|
| 391 | (fixnum char-idx))
|
|---|
| 392 | (case char
|
|---|
| 393 | (#\S (go S)) ;This covers over half of the possible endings
|
|---|
| 394 | ;by branching off the second to last character
|
|---|
| 395 | ;to other flag states that have plural endings.
|
|---|
| 396 | (#\R (setf flag-mask +R-mask+) ;"er" and "ier"
|
|---|
| 397 | (go D-R-Z-FLAG))
|
|---|
| 398 | (#\T (go T-FLAG)) ;"est" and "iest"
|
|---|
| 399 | (#\D (setf flag-mask +D-mask+) ;"ed" and "ied"
|
|---|
| 400 | (go D-R-Z-FLAG))
|
|---|
| 401 | (#\H (go H-FLAG)) ;"th" and "ieth"
|
|---|
| 402 | (#\N (setf flag-mask +N-mask+) ;"ion", "ication", and "en"
|
|---|
| 403 | (go N-X-FLAG))
|
|---|
| 404 | (#\G (setf flag-mask +G-mask+) ;"ing"
|
|---|
| 405 | (go G-J-FLAG))
|
|---|
| 406 | (#\Y (go Y-FLAG)) ;"ly"
|
|---|
| 407 | (#\E (go V-FLAG))) ;"ive"
|
|---|
| 408 | (return nil)
|
|---|
| 409 |
|
|---|
| 410 | S
|
|---|
| 411 | (setf char-idx (1- char-idx))
|
|---|
| 412 | (setf char (schar word char-idx))
|
|---|
| 413 | (if (char= char #\Y)
|
|---|
| 414 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 415 | (try-root dictionary word (1+ char-idx) +S-mask+)
|
|---|
| 416 | (return nil))
|
|---|
| 417 | (if (not (set-member-p char *sxzh*))
|
|---|
| 418 | (try-root dictionary word (1+ char-idx) +S-mask+)))
|
|---|
| 419 | (case char
|
|---|
| 420 | (#\E (go S-FLAG)) ;"es" and "ies"
|
|---|
| 421 | (#\R (setf flag-mask +Z-mask+) ;"ers" and "iers"
|
|---|
| 422 | (go D-R-Z-FLAG))
|
|---|
| 423 | (#\G (setf flag-mask +J-mask+) ;"ings"
|
|---|
| 424 | (go G-J-FLAG))
|
|---|
| 425 | (#\S (go P-FLAG)) ;"ness" and "iness"
|
|---|
| 426 | (#\N (setf flag-mask +X-mask+) ;"ions", "ications", and "ens"
|
|---|
| 427 | (go N-X-FLAG))
|
|---|
| 428 | (#\' (try-root dictionary word char-idx +M-mask+)))
|
|---|
| 429 | (return nil)
|
|---|
| 430 |
|
|---|
| 431 | S-FLAG
|
|---|
| 432 | (setf char-idx (1- char-idx))
|
|---|
| 433 | (setf char (schar word char-idx))
|
|---|
| 434 | (if (set-member-p char *sxzh*)
|
|---|
| 435 | (try-root dictionary word (1+ char-idx) +S-mask+))
|
|---|
| 436 | (if (and (char= char #\I)
|
|---|
| 437 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 438 | (try-modified-root dictionary word rooting-buffer (1+ char-idx)
|
|---|
| 439 | +S-mask+ char-idx #\Y))
|
|---|
| 440 | (return nil)
|
|---|
| 441 |
|
|---|
| 442 | D-R-Z-FLAG
|
|---|
| 443 | (if (char/= (schar word (1- char-idx)) #\E) (return nil))
|
|---|
| 444 | (try-root dictionary word char-idx flag-mask)
|
|---|
| 445 | (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
|
|---|
| 446 | (setf char (schar word char-idx))
|
|---|
| 447 | (if (char= char #\Y)
|
|---|
| 448 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 449 | (try-root dictionary word (1+ char-idx) flag-mask)
|
|---|
| 450 | (return nil))
|
|---|
| 451 | (if (char/= (schar word char-idx) #\E)
|
|---|
| 452 | (try-root dictionary word (1+ char-idx) flag-mask)))
|
|---|
| 453 | (if (and (char= char #\I)
|
|---|
| 454 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 455 | (try-modified-root dictionary word rooting-buffer (1+ char-idx)
|
|---|
| 456 | flag-mask char-idx #\Y))
|
|---|
| 457 | (return nil)
|
|---|
| 458 |
|
|---|
| 459 | P-FLAG
|
|---|
| 460 | (if (or (char/= (schar word (1- char-idx)) #\E)
|
|---|
| 461 | (char/= (schar word (- char-idx 2)) #\N))
|
|---|
| 462 | (return nil))
|
|---|
| 463 | (if (<= (setf char-idx (- char-idx 3)) 0) (return nil))
|
|---|
| 464 | (setf char (schar word char-idx))
|
|---|
| 465 | (if (char= char #\Y)
|
|---|
| 466 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 467 | (try-root dictionary word (1+ char-idx) +P-mask+)
|
|---|
| 468 | (return nil)))
|
|---|
| 469 | (try-root dictionary word (1+ char-idx) +P-mask+)
|
|---|
| 470 | (if (and (char= char #\I)
|
|---|
| 471 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 472 | (try-modified-root dictionary word rooting-buffer (1+ char-idx)
|
|---|
| 473 | +P-mask+ char-idx #\Y))
|
|---|
| 474 | (return nil)
|
|---|
| 475 |
|
|---|
| 476 | G-J-FLAG
|
|---|
| 477 | (if (< char-idx 3) (return nil))
|
|---|
| 478 | (setf char-idx (- char-idx 2))
|
|---|
| 479 | (setf char (schar word char-idx))
|
|---|
| 480 | (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\N))
|
|---|
| 481 | (return nil))
|
|---|
| 482 | (if (char/= (schar word (1- char-idx)) #\E)
|
|---|
| 483 | (try-root dictionary word char-idx flag-mask))
|
|---|
| 484 | (try-modified-root dictionary word rooting-buffer (1+ char-idx)
|
|---|
| 485 | flag-mask char-idx #\E)
|
|---|
| 486 | (return nil)
|
|---|
| 487 |
|
|---|
| 488 | N-X-FLAG
|
|---|
| 489 | (setf char-idx (1- char-idx))
|
|---|
| 490 | (setf char (schar word char-idx))
|
|---|
| 491 | (cond ((char= char #\E)
|
|---|
| 492 | (setf char (schar word (1- char-idx)))
|
|---|
| 493 | (if (and (char/= char #\Y) (char/= char #\E))
|
|---|
| 494 | (try-root dictionary word char-idx flag-mask))
|
|---|
| 495 | (return nil))
|
|---|
| 496 | ((char= char #\O)
|
|---|
| 497 | (if (char= (schar word (1- char-idx)) #\I)
|
|---|
| 498 | (try-modified-root dictionary word rooting-buffer char-idx
|
|---|
| 499 | flag-mask (1- char-idx) #\E)
|
|---|
| 500 | (return nil))
|
|---|
| 501 | (if (< char-idx 5) (return nil))
|
|---|
| 502 | (if (or (char/= (schar word (- char-idx 2)) #\T)
|
|---|
| 503 | (char/= (schar word (- char-idx 3)) #\A)
|
|---|
| 504 | (char/= (schar word (- char-idx 4)) #\C)
|
|---|
| 505 | (char/= (schar word (- char-idx 5)) #\I))
|
|---|
| 506 | (return nil)
|
|---|
| 507 | (setf char-idx (- char-idx 4)))
|
|---|
| 508 | (try-modified-root dictionary word rooting-buffer char-idx
|
|---|
| 509 | flag-mask (1- char-idx) #\Y))
|
|---|
| 510 | (t (return nil)))
|
|---|
| 511 |
|
|---|
| 512 | T-FLAG
|
|---|
| 513 | (if (or (char/= (schar word (1- char-idx)) #\S)
|
|---|
| 514 | (char/= (schar word (- char-idx 2)) #\E))
|
|---|
| 515 | (return nil)
|
|---|
| 516 | (setf char-idx (1- char-idx)))
|
|---|
| 517 | (try-root dictionary word char-idx +T-mask+)
|
|---|
| 518 | (if (<= (setf char-idx (- char-idx 2)) 0) (return nil))
|
|---|
| 519 | (setf char (schar word char-idx))
|
|---|
| 520 | (if (char= char #\Y)
|
|---|
| 521 | (if (set-member-p (schar word (1- char-idx)) *aeiou*)
|
|---|
| 522 | (try-root dictionary word (1+ char-idx) +T-mask+)
|
|---|
| 523 | (return nil))
|
|---|
| 524 | (if (char/= (schar word char-idx) #\E)
|
|---|
| 525 | (try-root dictionary word (1+ char-idx) +T-mask+)))
|
|---|
| 526 | (if (and (char= char #\I)
|
|---|
| 527 | (not (set-member-p (schar word (1- char-idx)) *aeiou*)))
|
|---|
| 528 | (try-modified-root dictionary word rooting-buffer (1+ char-idx)
|
|---|
| 529 | +T-mask+ char-idx #\Y))
|
|---|
| 530 | (return nil)
|
|---|
| 531 |
|
|---|
| 532 | H-FLAG
|
|---|
| 533 | (setf char-idx (1- char-idx))
|
|---|
| 534 | (setf char (schar word char-idx))
|
|---|
| 535 | (if (char/= char #\T) (return nil))
|
|---|
| 536 | (if (char/= (schar word (1- char-idx)) #\Y)
|
|---|
| 537 | (try-root dictionary word char-idx +H-mask+))
|
|---|
| 538 | (if (and (char= (schar word (1- char-idx)) #\E)
|
|---|
| 539 | (char= (schar word (- char-idx 2)) #\I))
|
|---|
| 540 | (try-modified-root dictionary word rooting-buffer (1- char-idx)
|
|---|
| 541 | +H-mask+ (- char-idx 2) #\Y))
|
|---|
| 542 | (return nil)
|
|---|
| 543 |
|
|---|
| 544 | Y-FLAG
|
|---|
| 545 | (setf char-idx (1- char-idx))
|
|---|
| 546 | (setf char (schar word char-idx))
|
|---|
| 547 | (if (char= char #\L)
|
|---|
| 548 | (try-root dictionary word char-idx +Y-mask+))
|
|---|
| 549 | (return nil)
|
|---|
| 550 |
|
|---|
| 551 | V-FLAG
|
|---|
| 552 | (setf char-idx (- char-idx 2))
|
|---|
| 553 | (setf char (schar word char-idx))
|
|---|
| 554 | (if (or (char/= char #\I) (char/= (schar word (1+ char-idx)) #\V))
|
|---|
| 555 | (return nil))
|
|---|
| 556 | (if (char/= (schar word (1- char-idx)) #\E)
|
|---|
| 557 | (try-root dictionary word char-idx +V-mask+))
|
|---|
| 558 | (try-modified-root dictionary word rooting-buffer (1+ char-idx)
|
|---|
| 559 | +V-mask+ char-idx #\E)
|
|---|
| 560 | (return nil)))
|
|---|
| 561 |
|
|---|
| 562 |
|
|---|
| 563 |
|
|---|
| 564 | ;;; DESCRIPTOR-FLAG returns t or nil based on whether the flag is on.
|
|---|
| 565 | ;;; From the diagram at the beginning of the file, we see that the flags
|
|---|
| 566 | ;;; are stored two words off of the first word in the descriptor unit for
|
|---|
| 567 | ;;; an entry.
|
|---|
| 568 | ;;;
|
|---|
| 569 | ;;; Note: modified for new descriptor scheme
|
|---|
| 570 | (defun descriptor-flag (dictionary descriptor flag-mask)
|
|---|
| 571 | (not (zerop
|
|---|
| 572 | (the fixnum
|
|---|
| 573 | (logand
|
|---|
| 574 | (the fixnum (desc-flags (descriptor-ref dictionary descriptor)))
|
|---|
| 575 | (the fixnum flag-mask))))))
|
|---|
| 576 |
|
|---|
| 577 | |
|---|
| 578 |
|
|---|
| 579 | ;;;; Looking up Trials
|
|---|
| 580 |
|
|---|
| 581 | ;;; these functions used to be macros
|
|---|
| 582 | (declaim (inline spell-string= found-entry-p))
|
|---|
| 583 |
|
|---|
| 584 | (defun spell-string= (string1 string2 end1 start2)
|
|---|
| 585 | (string= string1 string2
|
|---|
| 586 | :end1 end1
|
|---|
| 587 | :start2 start2
|
|---|
| 588 | :end2 (+ start2 end1)))
|
|---|
| 589 |
|
|---|
| 590 | ;;; FOUND-ENTRY-P determines if entry is what is described at idx.
|
|---|
| 591 | ;;; Hash-and-length is 16 bits that look just like the first word of any
|
|---|
| 592 | ;;; entry's descriptor unit (see diagram at the beginning of the file). If
|
|---|
| 593 | ;;; the word stored at idx and entry have the same hash bits and length,
|
|---|
| 594 | ;;; then we compare characters to see if they are the same.
|
|---|
| 595 | ;;;
|
|---|
| 596 | (defun found-entry-p (dictionary idx entry entry-len hash)
|
|---|
| 597 | (let ((desc (descriptor-ref dictionary idx)))
|
|---|
| 598 | (if (and (= (desc-hash-code desc) hash)
|
|---|
| 599 | (= (desc-length desc) entry-len))
|
|---|
| 600 | hash
|
|---|
| 601 | (spell-string= entry (string-table dictionary) entry-len
|
|---|
| 602 | (desc-string-index desc)))))
|
|---|
| 603 |
|
|---|
| 604 | (eval-when (:compile-toplevel :execute)
|
|---|
| 605 |
|
|---|
| 606 | (defmacro hash2-loop ((location-var contents-var)
|
|---|
| 607 | dictionary loc hash zero-contents-form
|
|---|
| 608 | &optional body-form (for-insertion-p nil))
|
|---|
| 609 | (let ((incr (gensym)))
|
|---|
| 610 | `(let* ((,incr (hash-increment ,hash +new-dictionary-size+))
|
|---|
| 611 | (,location-var ,loc)
|
|---|
| 612 | (,contents-var 0))
|
|---|
| 613 | (declare (fixnum ,location-var ,contents-var ,incr))
|
|---|
| 614 | (loop (setf ,location-var
|
|---|
| 615 | (rem (+ ,location-var ,incr) (the fixnum +new-dictionary-size+)))
|
|---|
| 616 | (setf ,contents-var (desc-table-ref ,dictionary ,location-var))
|
|---|
| 617 | (if (zerop ,contents-var) (return ,zero-contents-form))
|
|---|
| 618 | ,@(if for-insertion-p
|
|---|
| 619 | `((if (= ,contents-var spell-deleted-entry)
|
|---|
| 620 | (return ,zero-contents-form))))
|
|---|
| 621 | (if (= ,location-var ,loc) (return nil))
|
|---|
| 622 | ,@(if body-form `(,body-form))))))
|
|---|
| 623 |
|
|---|
| 624 | ) ;eval-when
|
|---|
| 625 |
|
|---|
| 626 |
|
|---|
| 627 | ;;; LOOKUP-ENTRY returns the index of the first element of entry's
|
|---|
| 628 | ;;; descriptor unit on success, otherwise nil.
|
|---|
| 629 | ;;;
|
|---|
| 630 | (defun lookup-entry (dictionary entry &optional length)
|
|---|
| 631 | (declare (simple-string entry))
|
|---|
| 632 | (let* ((entry-length (or length (length entry)))
|
|---|
| 633 | (hash (string-hash entry entry-length))
|
|---|
| 634 | (loc (rem hash (the fixnum +new-dictionary-size+)))
|
|---|
| 635 | (loc-contents (desc-table-ref dictionary loc)))
|
|---|
| 636 | (declare (fixnum entry-length hash loc))
|
|---|
| 637 | (cond ((zerop loc-contents) nil)
|
|---|
| 638 | ((found-entry-p dictionary loc-contents entry entry-length hash)
|
|---|
| 639 | loc-contents)
|
|---|
| 640 | (t
|
|---|
| 641 | (hash2-loop (loop-loc loc-contents)
|
|---|
| 642 | dictionary loc hash
|
|---|
| 643 | nil
|
|---|
| 644 | (if (found-entry-p dictionary loc-contents entry
|
|---|
| 645 | entry-length hash)
|
|---|
| 646 | (return loc-contents)))))))
|
|---|
| 647 |
|
|---|
| 648 |
|
|---|