Programs / Text Corrector
« on: October 12, 2021, 04:24:29 pm »
Text Corrector corrects writing errors while the user is composing a text. It doesn't require a vocabulary file; instead, it learns any language from a given sample text. The algorithm not only extracts the words from that sample, but also extracts the links between words and analyzes the probabilistic features of the language. Based on the collected data, typing errors are then modeled as Gaussian distributions whose mean is the correct value.
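The error model can be sketched as follows. The listing approximates the normal CDF with a lookup table (fillGaussTable) and a fixed deviation of 2; this Python sketch uses the closed form via erf instead, and the function names are mine:

```python
from math import erf, sqrt

def gauss_cdf(x: float, mean: float, deviation: float) -> float:
    """Cumulative probability of a normal distribution, in closed form via erf."""
    return 0.5 * (1.0 + erf((x - mean) / (deviation * sqrt(2.0))))

def error_probability(observed: int, intended: int, deviation: float = 2.0) -> float:
    """Probability mass of a Gaussian centered on the intended value, taken over
    the unit-wide interval around the observed value (the listing computes the
    same CDF difference, with deviation 2)."""
    return (gauss_cdf(observed + 0.5, intended, deviation)
            - gauss_cdf(observed - 0.5, intended, deviation))

# An observation equal to the intended value is the most probable outcome,
# and the probability falls off symmetrically with the distance from it.
assert error_probability(7, 7) > error_probability(6, 7) > error_probability(4, 7)
```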
While the user is typing a new word, the algorithm predicts which word the user intends to write by evaluating each vocabulary word's affinity, a weighted mean of three parameters:
- LENGTH(i): the probabilistic distance between the length of the typed word and the length of the i-th vocabulary word
- CONTEXT(i): the estimated frequency with which the i-th vocabulary word follows the last typed word
- STRUCTURE(i, j): the probabilistic distance of the j-th letter of the typed word from the letter at the j-th position of the i-th vocabulary word
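The affinity score itself is a plain convex combination. A minimal Python sketch, using the base weights that appear in the listing (0.15 for LENGTH, 0.30 for CONTEXT, 0.55 for STRUCTURE) and assuming the three parameters are already normalized to [0, 1]:

```python
def affinity(length_p: float, context_p: float, structure_p: float,
             a: float = 0.15, b: float = 0.30, c: float = 0.55) -> float:
    """Weighted mean of the three parameters; the weights must sum to 1
    (the same constraint is noted in the listing's computeWordAffinity)."""
    assert abs(a + b + c - 1.0) < 1e-9
    return a * length_p + b * context_p + c * structure_p

# A word that matches the typed prefix letter for letter (high STRUCTURE)
# outweighs one that is merely the right length.
assert affinity(0.2, 0.3, 0.9) > affinity(0.9, 0.3, 0.2)
```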
To enhance prediction performance, the algorithm recalibrates these parameters with two further correction mechanisms:
- BIAS: the weight of the LENGTH parameter is reduced while only a few letters have been typed, since the length is then probably not final (i.e., more letters will be typed, and the length will soon increase)
- BONUS: if the exact typed word exists in the vocabulary, its affinity is raised to the maximum, so that it prevails over all other vocabulary words.
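The two mechanisms can be sketched like this. The bias schedule below is a hypothetical one (the listing computes bias1 and bias2 elsewhere and subtracts them from the LENGTH weight in the call to computeWordAffinity), and score is an illustrative name:

```python
def recalibrated_weights(typed_letters: int, a: float = 0.15,
                         b: float = 0.30, c: float = 0.55):
    """BIAS: shift weight away from LENGTH toward CONTEXT and STRUCTURE while
    few letters have been typed; the shift fades out as the word grows.
    This linear schedule is hypothetical, but it mirrors the pattern of the
    listing's call (0.15 - bias1 - bias2, 0.30 + bias1, 0.55 + bias2)."""
    bias = max(0, 3 - typed_letters) * 0.02
    return a - 2 * bias, b + bias, c + bias

def score(word: str, typed: str, base_affinity: float) -> float:
    """BONUS: an exact vocabulary match saturates the score, so it prevails
    over every other candidate."""
    return 1.0 if word == typed else base_affinity

# With one letter typed, LENGTH counts for less than its base weight,
# and the three weights still sum to 1.
a, b, c = recalibrated_weights(1)
assert a < 0.15 and abs(a + b + c - 1.0) < 1e-9
# An exact match overrides the computed affinity.
assert score("park", "park", 0.4) == 1.0
```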
Even though the algorithm needs a long, complete text sample that is representative of the language in order to work fully, the default sample hard-coded in the program already gives a good demonstration of its predictive behavior.
Code: QB64:
_Title "Text Corrector"
sample$ = sample$ + temp$ + " "
Close #1
sample$ = "Today is a beautiful day, tomorrow I'm going to the park. The park is far from home, so today I will stay at home or maybe I will go to the supermarket. By the way, your house is beautiful and big, I really like it!"
lastArchivedWord$ = "."
isFirstTime = -1
lastInputWasPunctuation = 0
learnFromSample (sample$)
isFirstTime = 0
computeWordAffinity lastArchivedWord$, newWord$, 0.15 - bias1 - bias2, 0.30 + bias1, 0.55 + bias2, 1
refreshTextEditor archivedText$, lastArchivedWord$, newWord$
archivedText$ = archivedText$ + " " + lastArchivedWord$ + " "
archivedText$ = archivedText$ + " "
archivedText$ = archivedText$ + " " + highestAffinityWord$ + " "
newWord$ = ""
lastArchivedWord$ = key$
lastInputWasPunctuation = -1
Case " ":
archivedText$ = archivedText$ + " " + lastArchivedWord$ + " "
lastArchivedWord$ = newWord$
lastArchivedWord$ = highestAffinityWord$
newWord$ = ""
lastInputWasPunctuation = 0
lastInputWasPunctuation = 0
showComputedWordAffinity
toprint$ = archivedText$
toprint$ = toprint$ + lastArchivedWord$
toprint$ = toprint$ + newWord$ + "_"
Print "Write something, I'll try to correct you:"
Next I
Sub showComputedWordAffinity
best$ = highestAffinityWord$
Next i
Function highestAffinityWord$
highestAffinityWord$ = words(I)
Next I
decomposeIntoLinks (sample$)
orderLinks
countLinks
collectWords
Sub computeWordAffinity (oldword$, newWord$, A, B, C, BONUS) ' A = length weight, B = context weight, C = structure weight, BONUS = value added to global probability (constraint: A + B + C = 1)
wordsProbability(I, 1) = 0
wordsProbability(I, 2) = 0
wordsProbability(I, 3) = 0
wordsProbability(I, 3) = structureLikeness(newWord$, words(I))
Next I
searchLinks (oldword$)
position = wordPosition(searchedCountedLinks(I, 2))
Next I
wordsProbability(I, 4) = Int((A * wordsProbability(I, 1) + B * wordsProbability(I, 2) + C * wordsProbability(I, 3)) * 10 ^ 4) / 10 ^ 4
Next I
orderComputedWords
shuffleWords
Sub orderComputedWords
changed = 0
buffer$ = words(I - 1)
buffer1 = wordsProbability(I - 1, 1)
buffer2 = wordsProbability(I - 1, 2)
buffer3 = wordsProbability(I - 1, 3)
buffer4 = wordsProbability(I - 1, 4)
words(I - 1) = words(I)
wordsProbability(I - 1, 1) = wordsProbability(I, 1)
wordsProbability(I - 1, 2) = wordsProbability(I, 2)
wordsProbability(I - 1, 3) = wordsProbability(I, 3)
wordsProbability(I - 1, 4) = wordsProbability(I, 4)
words(I) = buffer$
wordsProbability(I, 1) = buffer1
wordsProbability(I, 2) = buffer2
wordsProbability(I, 3) = buffer3
wordsProbability(I, 4) = buffer4
changed = 1
Next I
Sub shuffleWords
buffer$ = words(I - 1)
buffer1 = wordsProbability(I - 1, 1)
buffer2 = wordsProbability(I - 1, 2)
buffer3 = wordsProbability(I - 1, 3)
buffer4 = wordsProbability(I - 1, 4)
words(I - 1) = words(I)
wordsProbability(I - 1, 1) = wordsProbability(I, 1)
wordsProbability(I - 1, 2) = wordsProbability(I, 2)
wordsProbability(I - 1, 3) = wordsProbability(I, 3)
wordsProbability(I - 1, 4) = wordsProbability(I, 4)
words(I) = buffer$
wordsProbability(I, 1) = buffer1
wordsProbability(I, 2) = buffer2
wordsProbability(I, 3) = buffer3
wordsProbability(I, 4) = buffer4
Next I
wordPosition = I
Next I
Sub collectWords
oldstr$ = countedLinks(1, 1)
j = 1
words(j) = oldstr$
j = j + 1
oldstr$ = countedLinks(i, 1)
Next i
words(j) = oldstr$
numWords = j
numWords = j - 1
totDistance = totDistance + probability(distance, 0)
Next I
structureLikeness = 0
searchedCountedLinks(I, 1) = ""
searchedCountedLinks(I, 2) = ""
searchedCountedLinks(I, 3) = ""
Next I
j = 0
totSearchedCountedLinks = 0
j = j + 1
searchedCountedLinks(j, 1) = countedLinks(I, 1)
searchedCountedLinks(j, 2) = countedLinks(I, 2)
searchedCountedLinks(j, 3) = countedLinks(I, 3)
Next I
numSearchedCountedLinks = j
Sub countLinks
oldstr1$ = links(1, 1)
oldstr2$ = links(1, 2)
counter = 1
j = 1
counter = counter + 1
countedLinks(j, 1) = oldstr1$
countedLinks(j, 2) = oldstr2$
oldstr1$ = links(I, 1)
oldstr2$ = links(I, 2)
j = j + 1
counter = 1
Next I
countedLinks(j, 1) = oldstr1$
countedLinks(j, 2) = oldstr2$
numCountedLinks = j
sample$ = standardize$(sample$)
oldword = newword
newword = extractedword
row = row + 1
I = j + 1
Next I
numLinks = row
Sub orderLinks
changed = 0
buffer1$ = links(I - 1, 1)
buffer2$ = links(I - 1, 2)
links(I - 1, 1) = links(I, 1)
links(I - 1, 2) = links(I, 2)
links(I, 1) = buffer1$
links(I, 2) = buffer2$
changed = 1
Next I
standardize$ = standardize$ + char$
Next i
destandardize$ = destandardize$ + char$
punctuation = isPunctuation(char$)
Next i
probability = cumulativeProbability(value + 0.5, mean, 2) - cumulativeProbability(value - 0.5, mean, 2)
value = (value - mean) / deviation
cumulativeProbability = GaussTable(adaptedvalue)
cumulativeProbability = 1
Sub fillGaussTable
GaussTable(0) = 0.5000
GaussTable(5) = 0.5199
GaussTable(10) = 0.5398
GaussTable(15) = 0.5596
GaussTable(20) = 0.5793
GaussTable(25) = 0.5987
GaussTable(30) = 0.6179
GaussTable(35) = 0.6368
GaussTable(40) = 0.6554
GaussTable(45) = 0.6736
GaussTable(50) = 0.6915
GaussTable(55) = 0.7088
GaussTable(60) = 0.7257
GaussTable(65) = 0.7421
GaussTable(70) = 0.7580
GaussTable(75) = 0.7734
GaussTable(80) = 0.7881
GaussTable(85) = 0.8023
GaussTable(90) = 0.8159
GaussTable(95) = 0.8289
GaussTable(100) = 0.8413
GaussTable(105) = 0.8531
GaussTable(110) = 0.8643
GaussTable(115) = 0.8749
GaussTable(120) = 0.8849
GaussTable(125) = 0.8944
GaussTable(130) = 0.9032
GaussTable(135) = 0.9115
GaussTable(140) = 0.9192
GaussTable(145) = 0.9265
GaussTable(150) = 0.9332
GaussTable(155) = 0.9394
GaussTable(160) = 0.9452
GaussTable(165) = 0.9505
GaussTable(170) = 0.9554
GaussTable(175) = 0.9599
GaussTable(180) = 0.9641
GaussTable(185) = 0.9678
GaussTable(190) = 0.9713
GaussTable(195) = 0.9744
GaussTable(200) = 0.9772
GaussTable(210) = 0.9821
GaussTable(220) = 0.9861
GaussTable(230) = 0.9893
GaussTable(240) = 0.9918
GaussTable(250) = 0.9938
GaussTable(260) = 0.9953
GaussTable(270) = 0.9965
GaussTable(280) = 0.9974
GaussTable(290) = 0.9981
GaussTable(310) = 0.9990
GaussTable(390) = 1
GaussTable(400) = 1
nextValidValue = 0
j = i + 1
nextValidValue = GaussTable(j)
nextValidValuePosition = j
j = j + 1
interpolation = Int((nextValidValue - lastValidValue) * (i - lastValidValuePosition) / (nextValidValuePosition - lastValidValuePosition) * 10 ^ 4) / 10 ^ 4
GaussTable(i) = lastValidValue + interpolation
lastValidValue = GaussTable(i)
lastValidValuePosition = i
Next i
Function compare (str1$, str2$) ' returns 0 if str1$ = str2$, 1 if str1$ < str2$, 2 if str1$ > str2$
compare = 2
compare = 1
Next I
charDistance = 10
charDistance = I
charDistance = I
Next I
isSpecial = 0
isPunctuation = 0
isUnsupported = 0
min = int1
min = int2
max = int1
max = int2