' Deal with file names.
' Defaults the input name to "doc.txt" (announced with a PRINT) and derives the
' output name by appending "-out.csv" either to the first j - 1 characters of
' file$ or to file$ as a whole.
' NOTE(review): the statements below are split across physical lines, and neither
' the assignment of j nor any IF/ELSE choosing between the two fileout$
' assignments is visible here - confirm against the original listing.
c$ = "doc.txt"
PRINT "No input file specified. Assuming " + c$
+ "." file$ = c$
fileout$
= LEFT$(file$
, j
- 1) + "-out.csv" fileout$ = file$ + "-out.csv"
' Hash table configuration and shared state: the table size, the bracket
' markers LB/RB, and the table itself (EnglishDictionary).
' NOTE(review): the text LB = "{/" sits after the comment marker on the AS STRING
' line, so as written it is comment text and LB is never assigned; it looks like a
' statement that lost its line break - confirm. No DIM SHARED for RB is visible either.
HashTableSize = 300007 ' Best to use a big prime number. Bigger examples are 611953 and 1014729.
DIM SHARED LB
AS STRING ' Make sure that bracketing sequences do not appear in the data source, otherwise use (a) special character(s). LB = "{/"
RB = "/}"
DIM SHARED EnglishDictionary
(HashTableSize
) AS STRING ' Hash table size does not need to equal the size of the source dictionary itself.
' Analysis/debugging tool:
' Prepare a plot of hash values versus frequency of occurrence (histogram).
' Open Oxford English Dictionary and load all information into a hash table (the array EnglishDictionary).
' NOTE(review): no OPEN, loop, or line-read statement is visible in this section, and
' several statements follow a comment marker on the same line (so they read as
' comment text) - confirm against the original listing.
PRINT "Loading dictionary..." i = 0
IF k
> 0 AND j
= 0 THEN j
= k
' Handles (the) word(s like) "To , " i = i + 1
b$
= LEFT$(a$
, j
- 1) ' Extract the base word. IF RIGHT$(b$
, 1) = "1" THEN b$
= LEFT$(b$
, LEN(b$
) - 1) ' Remove trailing `1' from words with multiple definitions. b$
= LCASE$(b$
) ' Convert to lowercase. ' Append previous entry with "Usage" information from source.
' The source was originally formatted such that "Usage" parses exactly as a dictionary word.
' The assignment below drops the trailing RB from slot d, appends "... " + b$ + ": " + c$, and closes with RB again.
' As laid out here it uses d before the HashFunc assignment further down; presumably
' the two paths were alternatives of a branch that is not visible - confirm.
EnglishDictionary
(d
) = LEFT$(EnglishDictionary
(d
), LEN(EnglishDictionary
(d
)) - LEN(RB
)) + "... " + b$
+ ": " + c$
+ RB
d = HashFunc(b$) ' Calculate the hash value (array address) of the word on hand.
' Store the word and definition in the following format: {/word/}{/definition/}
' Collisions are appended to the right and parsed later during lookup: {/word1/}{/definition1/}{/word2/}{/definition2/}
EnglishDictionary(d) = EnglishDictionary(d) + LB + b$ + RB + LB + c$ + RB
PRINT #2, d
' Record the hash value (analysis/debugging). CLOSE #2 ' Close histogram data file (analysis/debugging).
' Done developing fast lookup tool. Now time for an application.
' Open user-specified text document and make a list of unique words (without counting).
' NOTE(review): no OPEN, read loop, uniqueness test, or loop over k is visible in
' this section, and some statements follow a comment marker on the same line -
' confirm against the original listing.
WordList$ = ""
PRINT "Loading user document..." c$ = a$
b$
= LEFT$(c$
, j
- 1) ' Extract the base word. b$
= LCASE$(b$
) ' Convert to lowercase. ' Remove punctuation and stray marks.
' TheNaughtyList$ holds the characters to strip; each one is selected with MID$(TheNaughtyList$, k, 1)
' and replaced by the empty string in b$.
TheNaughtyList$
= "`1234567890=~!@#$%^&*()_+[]\{}|;:,.<>/? " + CHR$(34) + CHR$(10) + CHR$(13) ' ignores hyphen and single quote as in: all-that's b$
= ReplaceSubString$
(b$
, MID$(TheNaughtyList$
, k
, 1), "") ' Add to word list in format: {/word1/}{/word2/}{/word3/}...
WordList$ = WordList$ + LB + b$ + RB
' Lookup: fetches the table slot EnglishDictionary(HashFunc(a)), pulls bracketed
' fields out of it with ReturnBetween(b$, LB, RB), and returns a + " " + d$ via Lookup$.
' NOTE(review): the FUNCTION header is not visible, so the type of a is an
' assumption (presumably a string); the first statements may belong to preceding
' code. c$ and d$ are assigned from identical ReturnBetween calls, and no code
' that advances through b$ or compares c$ with a is visible - confirm.
b$ = c$
c$ = ""
r$ = ""
b$ = EnglishDictionary(HashFunc(a))
c$ = ""
d$ = ""
c$ = ReturnBetween(b$, LB, RB)
d$ = ReturnBetween(b$, LB, RB)
r$ = a + " " + d$
Lookup$ = r$
' ReturnBetween: the result is the substring of a that starts at position i + f
' and has length j - (i + f).
' NOTE(review): the FUNCTION header and the code that sets i, j and f are not
' visible; presumably i and j locate the left and right markers and f is the
' left marker's length - confirm.
ReturnBetween$
= MID$(a
, i
+ f
, j
- (i
+ f
))
' HashFunc: starts sum at HashTableSize, adds k * COS(ASC(k-th character of a)),
' removes "." from STR$(sum), takes ABS(VAL(...)) of that text, and reduces the
' result MOD HashTableSize before returning it.
' NOTE(review): the FUNCTION header and the loop over k are not visible, and the
' statement text is split across physical lines - confirm against the original listing.
sum = HashTableSize
sum
= sum
+ k
* COS(ASC(MID$(a
, k
, 1))) ' Without the linear factor of k, permutations have same hash values. sum
= ABS(VAL(ReplaceSubString$
(STR$(sum
), ".", ""))) sum
= sum
MOD HashTableSize
HashFunc = sum
' ReplaceSubString: one assignment builds r$ from the first j - 1 characters of a,
' then c, then a recursive ReplaceSubString$ call on the rightmost
' LEN(a) - j + 1 - LEN(b) characters of a (same b and c); the other sets r$ = a.
' The function result is r$.
' NOTE(review): the FUNCTION header, the computation of j, and the condition that
' chooses between the two r$ assignments are not visible; presumably j is the
' position of b within a - confirm.
r$
= LEFT$(a
, j
- 1) + c
+ ReplaceSubString$
(RIGHT$(a
, LEN(a
) - j
+ 1 - LEN(b
)), b
, c
) r$ = a
ReplaceSubString$ = r$