# 1 word2vec训练 (word2vec training)
#coding=utf-8
import jieba
import jieba.posseg as pseg
import smart_open
import xlrd
import codecs
from gensim.models import Word2Vec
def _segment_file(path):
    """Return one list of words per line of the UTF-8 text file *path*."""
    sentences = []
    with open(path, mode="r", encoding="utf-8") as src:
        # Stream the file instead of materialising it with readlines().
        for raw_line in src:
            # pseg.cut yields (word, flag) pairs; only the word is kept.
            # A blank line produces an empty list, which is still recorded.
            sentences.append([pair.word for pair in pseg.cut(raw_line.strip())])
    return sentences


def read_file(input_files=("07.txt", "zhishiku.txt"), user_dict="dic.txt",
              output_file="seg_result.txt"):
    """Segment the corpus files with jieba and dump the result to disk.

    Args:
        input_files: Paths of the UTF-8 text files to segment, one sentence
            per line. They are processed in order and their sentences are
            concatenated.
        user_dict: Path of the custom dictionary passed to
            ``jieba.load_userdict`` before segmenting.
        output_file: Path of the UTF-8 file that receives the segmented
            sentences, one per line, words separated by single spaces.

    Returns:
        A list of sentences, each sentence being a list of word strings.
    """
    jieba.load_userdict(user_dict)
    user_query_segment_list = []
    for path in input_files:
        user_query_segment_list.extend(_segment_file(path))
        # Running total of sentences collected so far, printed per file.
        print(len(user_query_segment_list))
    with open(output_file, mode="w", encoding="utf-8") as fw:
        for words in user_query_segment_list:
            # The strip drops leading/trailing whitespace left by
            # whitespace-only tokens at either end of the sentence.
            fw.write(" ".join(words).strip() + "\n")
    return user_query_segment_list
def export_to_file(model, output_file):
    """Write every vocabulary word and its vector to a UTF-8 text file.

    Each output line has the form ``<word> <v1> <v2> ...``, separated by
    single spaces. No header line is written.

    Args:
        model: Object exposing trained word vectors through ``model.wv``
            (a gensim ``Word2Vec`` model in this script).
        output_file: Path of the file to create; it is opened in ``'w'``
            mode, so an existing file is overwritten.
    """
    # ``with`` guarantees the file is closed even if a lookup or write fails.
    with codecs.open(output_file, 'w', 'utf-8') as output:
        print('done loading Word2Vec')
        keyed_vectors = model.wv
        # gensim >= 4.0 replaced ``wv.vocab`` with ``wv.key_to_index``;
        # fall back to ``vocab`` so that gensim 3.x models keep working.
        if hasattr(keyed_vectors, "key_to_index"):
            vocab = keyed_vectors.key_to_index
        else:
            vocab = keyed_vectors.vocab
        for mid in vocab:
            # Index through ``model.wv``: ``model[word]`` is deprecated in
            # gensim 3.x and removed in 4.0.
            vector_str = " ".join(
                str(dimension) for dimension in keyed_vectors[mid]
            )
            output.write(mid + " " + vector_str + "\n")
if __name__ == "__main__":
    # Segment the corpora. The result is a list of lists: one inner list of
    # already-segmented words per sentence.
    user_query_list = read_file()

    # The training/export recipes below are intentionally left disabled.
    # NOTE(review): ``size=`` and ``iter=`` are gensim 3.x keyword names;
    # confirm the installed gensim version before re-enabling.
    #
    # Recipe A: train in one call, then save in word2vec binary format.
    # model = Word2Vec(user_query_list, size=100, window=5, min_count=1, workers=4, iter=10)
    # # model.save('/tmp/MyModel')
    # # model.save_word2vec_format('mymodel2.bin', binary=True)
    # model.wv.save_word2vec_format('mymodel3.bin', binary=True)
    # # print(model.wv.most_similar("公积金"))
    # # export_to_file(model,"word2vec_by_gensim_ly_train_dev_test_0702.txt")
    #
    # Recipe B: build the vocabulary and train as two explicit steps.
    # model = Word2Vec()
    # model.build_vocab(user_query_list)
    # model.train(user_query_list, total_examples = model.corpus_count, epochs = 10)
    # print(model.wv.most_similar("公积金"))
    # model.wv.save_word2vec_format('mymodel2.bin', binary=True)
    # # model.save('mymodel2.bin')
    #
    # Reference: https://www.jianshu.com/p/05fb666a72c4
# 2 word2vec使用 (using word2vec)
import gensim
from gensim.models import word2vec
# Load word vectors stored in word2vec text format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    "skipgram_200_mincount10.vec"
)
# Print the similarity of "日期" to each comparison word, in the original
# order.
for other_word in ("时间", "延期"):
    print(model.similarity("日期", other_word))
# 3 fasttext使用 (using fastText)