kenlm

kenlm

1
2
3
4
5
6
7
8
9
10
11
import kenlm

# Load the binary language-model file into KenLM.
model = kenlm.LanguageModel("/data/NLP/Language_Models/lm.bin")

# Score each sample sentence with the language model. The trailing comment on
# each entry is the score noted next to it in the original example. The return
# value is discarded here; wrap the call in print() to see it outside a REPL.
for sentence in (
    'you are a good man',              # -20.92301368713379
    "I'm fine,thinks",                 # -21.117055892944336
    "wos as dadawnqsao asd aa aa aa",  # -46.037437438964844
):
    model.score(sentence)

py-kenlm-model

https://github.com/mattzheng/py-kenlm-model

旺旺教:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
import os
import kenlm

# Inputs: where the binary model lives, and the space-separated sentence to score.
path = "/data_local/slm/chinese_csc_1268.bin"
sentence = '今天 天气 很 好'

# Load the model, then report its n-gram order followed by the sentence score.
model = kenlm.LanguageModel(path)
print('{0}-gram model'.format(model.order))
print(model.score(sentence))


# for item in model.full_scores(sentence):
#     print(item)


# Check that total full score = direct score
def score(s):
    """Return the sum of the per-token scores yielded by ``model.full_scores(s)``.

    Each item produced by ``model.full_scores(s)`` is unpacked as a 3-tuple;
    only its first element contributes to the total.
    """
    return sum(prob for prob, _, _ in model.full_scores(s))


# The summed per-token scores must agree with model.score() to within 1e-3.
assert abs(score(sentence) - model.score(sentence)) < 1e-3

# Show scores and n-gram matches
# Wrap the tokens in sentence-boundary markers so the indices used below line
# up with the items yielded by full_scores().
words = ['<s>'] + sentence.split() + ['</s>']
for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
    # words[i + 1] is the token reported at step i; the slice ends on that
    # token and reaches back `length` tokens in total.
    print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length:i + 2])))
    if oov:
        print('\t"{0}" is an OOV'.format(words[i + 1]))

# Find out-of-vocabulary words
# Membership is tested with the model's `in` operator for every entry of
# `words`, including the boundary markers added to the list.
for w in words:
    if w not in model:
        print('"{0}" is an OOV'.format(w))