(1) Naive approach

Scan the text word by word and check each word against the dictionary:

```python
def naive_dict_match(text, dictionary):
    """Collect every whitespace-separated word that appears in the dictionary."""
    matches = []
    words = text.split()
    for word in words:
        if word in dictionary:
            matches.append(word)
    return matches
```
```python
# Example
text = "the quick brown fox jumps over the lazy dog"
dictionary = {"the", "fox", "dog"}
print(naive_dict_match(text, dictionary))  # Output: ['the', 'fox', 'the', 'dog']
```
Time complexity: O(n*m), where n is the length of the text and m is the size of the dictionary. This assumes a list-like dictionary with linear-time lookups; with a Python set, as in the example above, each lookup averages O(1), so the whole pass is roughly O(n).
(2) Hash-table-based approach

Use a hash set for O(1) average-time word lookups:
```python
def hash_dict_match(text, dictionary):
    # Build the set once so each membership test is O(1) on average
    word_set = set(dictionary)
    return [word for word in text.split() if word in word_set]
```
```python
# A simple tokenizer based on dictionary matching (forward maximum matching):
# at each position, greedily take the longest dictionary word that starts there.
def dictionary_segment(text, dictionary):
    max_len = max(len(word) for word in dictionary) if dictionary else 1
    result = []
    index = 0
    n = len(text)
    while index < n:
        matched = False
        # Try the longest candidate first, then progressively shorter ones
        for size in range(max_len, 0, -1):
            if index + size > n:
                continue
            word = text[index:index + size]
            if word in dictionary:
                result.append(word)
                index += size
                matched = True
                break
        if not matched:
            # No dictionary word starts here; emit the single character
            result.append(text[index])
            index += 1
    return result

# e.g. dictionary_segment("itisatest", {"it", "is", "a", "test"})
# -> ['it', 'is', 'a', 'test']
```
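Application: sensitive-word filtering. The SensitiveFilter below depends on a Trie class that is not shown in this excerpt (presumably defined earlier in the article). A minimal sketch compatible with the interface the filter uses — a children dict, an is_end flag, and an insert method:

```python
class TrieNode:
    def __init__(self):
        self.children = {}   # char -> TrieNode
        self.is_end = False  # True if a word ends at this node

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        node = self.root
        for ch in word:
            node = node.children.setdefault(ch, TrieNode())
        node.is_end = True
```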
```python
class SensitiveFilter:
    def __init__(self, sensitive_words):
        self.trie = Trie()
        for word in sensitive_words:
            self.trie.insert(word)

    def filter(self, text, replace_char="*"):
        result = []
        i = 0
        n = len(text)
        while i < n:
            node = self.trie.root
            j = i
            last_match = -1  # end index of the longest sensitive word starting at i
            # Walk the trie as far as the text allows, remembering the longest match
            while j < n and text[j] in node.children:
                node = node.children[text[j]]
                j += 1
                if node.is_end:
                    last_match = j
            if last_match != -1:
                # Mask the longest sensitive word found at position i
                result.append(replace_char * (last_match - i))
                i = last_match
            else:
                result.append(text[i])
                i += 1
        return ''.join(result)
```
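A quick check of the filter's behavior (the words and text here are my own, for illustration):

```python
f = SensitiveFilter(["damn", "heck"])
print(f.filter("well damn it"))  # -> "well **** it"
```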
Application: parallel matching. Split the text into chunks and match them concurrently with a process pool:

```python
from multiprocessing import Pool

def parallel_dict_match(text_chunks, dictionary):
    # Match each chunk in a separate worker process, then flatten the results
    with Pool() as pool:
        results = pool.starmap(
            hash_dict_match,
            [(chunk, dictionary) for chunk in text_chunks]
        )
    return [match for sublist in results for match in sublist]
```
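A sketch of how this might be driven, assuming hash_dict_match is defined at module level so worker processes can pickle it (the `__main__` guard is required on platforms that spawn rather than fork workers):

```python
if __name__ == "__main__":
    chunks = [
        "the quick brown fox",
        "jumps over the lazy dog",
    ]
    dictionary = {"the", "fox", "dog"}
    print(parallel_dict_match(chunks, dictionary))
    # -> ['the', 'fox', 'the', 'dog']
```

Note that chunks should be split on whitespace boundaries: a word cut in half across two chunks would be missed by both workers.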