TLG (Part 9)

Train the n-gram G

# Train the LM
langname=ngram_7g_train_en_zh_hua_lexicon_word
local/train_lms_1gram.sh $langname/lexicon.1 $langname/text_split.biglettle $langname/lm_part_en5
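
train_lms_1gram.sh itself is not listed in this post. As a minimal sketch, assuming it wraps SRILM's ngram-count with the order-3 Kneser-Ney settings implied by the srilm.o3g.kn.gz file used later (the exact options are an assumption):

# Hypothetical SRILM call behind train_lms_1gram.sh
cut -d' ' -f1 $langname/lexicon.1 > $langname/vocab.txt
ngram-count -order 3 -kndiscount -interpolate \
    -limit-vocab -vocab $langname/vocab.txt \
    -text $langname/text_split.biglettle \
    -lm $langname/lm_part_en5/srilm/srilm.o3g.kn.gz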

Generate the lexicon.txt used in the TLG:

Working directory: /home/yelong/data/wenet/examples/aishell/s0

# Generate lexicon.txt:
dict=exp/aban-c009/lang.char.txt
bpe_model=exp/aban-c009/bpe.model

unit_file=$dict
langname=ngram_7g_train_en_zh_hua_lexicon_word
mkdir -p data/local/dict_aban-c009_$langname
cp $unit_file data/local/dict_aban-c009_$langname/units.txt
# Put the words from text_split.biglettle with frequency > 80 into the dictionary,
# together with the acoustic modeling units, giving lexicon.1
# Drop OOV words not covered by the acoustic units; split Chinese into
# space-separated characters and encode English with BPE
python split_sentence_oov.py data/local/dict_aban-c009_$langname/units.txt \
    /home/yelong/data/kaldi/egs/librispeech/s5/${langname}/lexicon.1 \
    > data/local/dict_aban-c009_${langname}/lexicon.txt.nosplit

# Split pieces such as PS into ▁P ▁S
python split_lexicon.py data/local/dict_aban-c009_${langname}/lexicon.txt.nosplit > data/local/dict_aban-c009_${langname}/lexicon.txt
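split_lexicon.py is not listed either. Below is a minimal sketch of what it might do, based only on the "PS → ▁P ▁S" comment above: multi-letter English pieces are split into single-letter ▁-prefixed units, while CJK characters and single letters pass through unchanged.

# Hypothetical stand-in for split_lexicon.py (needs gawk for split(s, a, ""))
gawk '{
    printf "%s", $1
    for (i = 2; i <= NF; i++) {
        p = $i
        sub(/^▁/, "", p)
        if (p ~ /^[A-Z][A-Z]+$/) {                        # e.g. ▁PS
            n = split(p, ch, "")
            for (j = 1; j <= n; j++) printf " ▁%s", ch[j] # -> ▁P ▁S
        } else {
            printf " %s", $i                              # keep 你, ▁A, ... as-is
        }
    }
    print ""
}' data/local/dict_aban-c009_${langname}/lexicon.txt.nosplit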

The text is then processed with the new lexicon.

Here, split_sentence_oov.py is:

import re
import sys

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("/home/yelong/data/wenet/examples/aishell/s0/exp/aban-c009/bpe.model")

# Load the acoustic modeling units (first column of units.txt).
unit_table = set()
with open(sys.argv[1], 'r', encoding='utf8') as fin:
    for line in fin:
        unit = line.split()[0]
        unit_table.add(unit)


def contain_oov(units):
    for unit in units:
        if unit not in unit_table:
            return True
    return False


def __tokenize_by_bpe_model(txt):
    tokens = []
    # CJK (China Japan Korea) unicode range is [U+4E00, U+9FFF], ref:
    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    pattern = re.compile(r'([\u4e00-\u9fff])')
    # Example:
    #   txt = "你好 IT'S OKAY 的"
    #   chars = ["你", "好", " IT'S OKAY ", "的"]
    chars = pattern.split(txt.upper())
    mix_chars = [w for w in chars if len(w.strip()) > 0]
    for ch_or_w in mix_chars:
        # ch_or_w is a single CJK character (i.e., "你"), keep it as one token.
        if pattern.fullmatch(ch_or_w) is not None:
            if contain_oov(ch_or_w):
                return "yelong"  # sentinel: this line contains an OOV unit
            else:
                tokens.append(ch_or_w)
        # ch_or_w contains non-CJK characters (i.e., " IT'S OKAY "),
        # encode ch_or_w using the BPE model.
        else:
            for p in sp.encode_as_pieces(ch_or_w):
                if p not in unit_table:  # check the piece itself, not ch_or_w
                    return "yelong"
                else:
                    tokens.append(p)

    return tokens


# src_file = 'text_space_eng'
src_file = sys.argv[2]
with open(src_file, "r", encoding="utf8") as fs:
    for line in fs:
        line = line.strip()
        temp = __tokenize_by_bpe_model(line)
        if temp != "yelong":
            print(line, " ".join(temp))
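
Each surviving output line is the original entry followed by its unit sequence; any line containing a unit missing from units.txt is dropped ("yelong" is simply the author's in-band OOV marker). A hypothetical example, assuming bpe.model keeps OKAY as a single piece:

# input line (lexicon.1)  ->  output line (lexicon.txt.nosplit)
# 你好                     ->  你好 你 好
# OKAY                     ->  OKAY ▁OKAY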


Generate the TLG:

# Generate L.fst and T.fst:
langname=ngram_7g_train_en_zh_hua_lexicon_word
tools/fst/compile_lexicon_token_fst.sh \
data/local/dict_aban-c009_${langname} data/local/tmp_aban-c009_${langname} data/local/lang_aban-c009_${langname}

# Decompress to lm.arpa
gunzip -c /home/yelong/data/kaldi/egs/librispeech/s5/$langname/lm_part_en5/srilm/srilm.o3g.kn.gz > /home/yelong/data/kaldi/egs/librispeech/s5/$langname/lm_part_en5/srilm/lm.arpa


# Generate G.fst and TLG.fst:
tools/fst/make_tlg.sh /home/yelong/data/kaldi/egs/librispeech/s5/$langname/lm_part_en5/srilm/ data/local/lang_aban-c009_$langname data/lang_aban-c009_$langname
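
make_tlg.sh builds G.fst from lm.arpa and composes it with the lexicon and token FSTs into TLG.fst. A quick sanity check with the OpenFst tools (assuming they are on PATH):

# Optional: confirm the composed graph was written and looks sane
fstinfo data/lang_aban-c009_$langname/TLG.fst | head
head data/lang_aban-c009_$langname/words.txt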

Decode the test set

# Decode
export GLOG_logtostderr=1
export GLOG_v=3

langname=ngram_7g_train_en_zh_hua_lexicon_word


# docker:067:
#wav_dir=/home/data/yelong/docker_seewo/corpus/ftv-5000s/10-sub-ftv-1w/
#name=$(basename $wav_dir)
./tools/decode.sh --nj 10 \
    --acoustic_scale 10 --lattice_beam 30 --max_active 7000 \
    --ctc_weight 0.05 --rescoring_weight 1 --chunk_size -1 --blank_skip_thresh 0.98 \
    --dict_path /home/aban-c009/lang_aban-c009_$langname/words.txt \
    --fst_path /home/aban-c009/lang_aban-c009_$langname/TLG.fst \
    /home/data/yelong/docker_seewo/corpus/200/doc/wav.scp \
    /home/data/yelong/docker_seewo/corpus/200/doc/text \
    /home/aban-c009/final.zip \
    /home/aban-c009/lang_aban-c009_$langname/units.txt \
    exp/aban-c009/200.11/lm_10_attention_rescore_ReduceBlankprob_1_OnlyCurBestEqualBlank
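
tools/decode.sh scores the hypotheses against the given reference text and writes its results into the last argument, the output directory. One way to inspect the outcome afterwards (the exact file names are an assumption about this decode.sh version):

dir=exp/aban-c009/200.11/lm_10_attention_rescore_ReduceBlankprob_1_OnlyCurBestEqualBlank
tail $dir/wer    # hypothetical: final WER summary written by the scoring step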