Chinese Error Correction Based on Python Fault Tolerant Prefix Tree
- 2021-11-14 06:11:08
- OfStack
Realization
Reference
Introduction
This paper uses Python to implement prefix tree, and supports editing distance fault-tolerant query. The prefix tree in this paper only stores three segmentation words in the format of (segmentation string, frequency), such as: ('Zhonghai Jinxiyuan', 2), ('Zhonghai Xiyuan', 24), ('Zhongnanhai', 4), which can be replaced by their own files. When querying, you should specify 1 string and the maximum fault-tolerant editing distance.
Realization
class Word:
def __init__(self, word, freq):
self.word = word
self.freq = freq
class Trie:
def __init__(self):
self.root = LetterNode('')
self.START = 3
def insert(self, word, freq):
self.root.insert(word, freq, 0)
def findAll(self, query, maxDistance):
suggestions = self.root.recommend(query, maxDistance, self.START)
return sorted(set(suggestions), key=lambda x: x.freq)
class LetterNode:
def __init__(self, char):
self.REMOVE = -1
self.ADD = 1
self.SAME = 0
self.CHANGE = 2
self.START = 3
self.pointers = []
self.char = char
self.word = None
def charIs(self, c):
return self.char == c
def insert(self, word, freq, depth):
if ' ' in word:
word = [i for i in word.split(' ')]
if depth < len(word):
c = word[depth].lower()
for next in self.pointers:
if next.charIs(c):
return next.insert(word, freq, depth + 1)
nextNode = LetterNode(c)
self.pointers.append(nextNode)
return nextNode.insert(word, freq, depth + 1)
else:
self.word = Word(word, freq)
def recommend(self, query, movesLeft, lastAction):
suggestions = []
length = len(query)
if length >= 0 and movesLeft - length >= 0 and self.word:
suggestions.append(self.word)
if movesLeft == 0 and length > 0:
for next in self.pointers:
if next.charIs(query[0]):
suggestions += next.recommend(query[1:], movesLeft, self.SAME)
break
elif movesLeft > 0:
for next in self.pointers:
if length > 0:
if next.charIs(query[0]):
suggestions += next.recommend(query[1:], movesLeft, self.SAME)
else:
suggestions += next.recommend(query[1:], movesLeft - 1, self.CHANGE)
if lastAction != self.CHANGE and lastAction != self.REMOVE:
suggestions += next.recommend(query, movesLeft - 1, self.ADD)
if lastAction != self.ADD and lastAction != self.CHANGE:
if length > 1 and next.charIs(query[1]):
suggestions += next.recommend(query[2:], movesLeft - 1, self.REMOVE)
elif length > 2 and next.charIs(query[2]) and movesLeft == 2:
suggestions += next.recommend(query[3:], movesLeft - 2, self.REMOVE)
else:
if lastAction != self.CHANGE and lastAction != self.REMOVE:
suggestions += next.recommend(query, movesLeft - 1, self.ADD)
return suggestions
def buildTrieFromFile():
trie = Trie()
rows = [(' Zhonghai Jinxi Garden ', 2),(' Zhonghai West Garden ', 24),(' Zhongnanhai ', 4)]
for row in rows:
trie.insert(row[0], int(row[1]))
return trie
def suggestor(trie, s, maxDistance):
if ' ' in s:
s = [x for x in s.split(' ')]
suggestions = trie.findAll(s, maxDistance)
return [str(x.word) for x in suggestions]
if __name__ == "__main__":
trie = buildTrieFromFile()
r = suggestor(trie, ' Zhonghai Jinxi Garden ', 1)
print(r)
Analysis
Result print:
['Zhonghai Jinxi Garden', 'Zhonghai West Garden']
It can be seen that "Zhonghai Jinxiyuan" is exactly the same string as the input, and the editing distance is 0, so it meets the requirement of maximum editing distance of 1 and returns directly.
"Zhonghai West Garden" is the result of removing the word "Jin" from "Zhonghai Jinxi Garden", and the editing distance is 1, so it meets the requirement of maximum editing distance of 1, and returns directly.
In addition, the editing distance between Zhongnanhai and Zhonghai Jinxiyuan is 4, which does not meet the requirement of maximum editing distance of 1, so it does not appear in the results.
Reference
https://github.com/leoRoss/AutoCorrectTrie