# Bag-of-words model:
# CountVectorizer counts how often each vocabulary term occurs in the corpus, which is exactly the bag-of-words model.
# Approach: every term that appears in the corpus becomes a feature, and the number of times that term occurs in the current document becomes the feature value.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
# Corpus
docs = np.array([
"猪肉 黄瓜 鸡蛋 木耳 盐 料酒 淀粉 植物油.",
"猪肉 土豆 葱 姜 蒜 面粉 料酒 白糖 醋 老抽.",
"猪肉 鸡蛋 面粉 盐 啤酒 酱油 白胡椒粉 料酒.",
"猪肉 姜 葱 小油菜 鸡蛋 生抽 老抽 料酒 糖 淀粉 香油.",
])
# bag is a sparse matrix, because the bag-of-words representation is inherently sparse.
bag = count.fit_transform(docs)
# Print the mapping from terms to column indices.
print(count.vocabulary_)
# Call the sparse matrix's toarray method to convert it into an ndarray.
print(bag)
print(bag.toarray())
# vocabulary_ maps each term to an integer index.
# That index is also the column index into the ndarray returned by bag.toarray().
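# Note (not in the original code): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-character ingredients such as '盐' are silently dropped.
# A minimal sketch that keeps them relaxes the pattern; count_single is a hypothetical name.
count_single = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
bag_single = count_single.fit_transform(docs)
print(count_single.vocabulary_)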
import nltk
nltk.download('stopwords')
nltk.download('punkt')
!pip install gensim==4.1.2
!pip install jieba==0.42.1
sent1 = "猪肉 黄瓜 鸡蛋 木耳 盐 料酒 淀粉 植物油."
sent2 = "猪肉 鸡蛋 面粉 盐 啤酒 酱油 白胡椒粉 料酒."
# Tokenization (word_tokenize works here because the ingredients are already space-separated)
from nltk import word_tokenize
sents = [sent1, sent2]
texts = [[word for word in word_tokenize(sent)] for sent in sents]
print(sents)
print(texts)
# Build the corpus (the set of all tokens)
all_list = []
for text in texts:
    all_list += text
corpus = set(all_list)
print(corpus)
# Map every token (and punctuation mark) in the corpus to an integer id
corpus_dict = dict(zip(corpus, range(len(corpus))))
print(corpus_dict)
# Build the vector representation of a sentence
def vector_rep(text, corpus_dict):
    vec = []
    for key in corpus_dict.keys():
        if key in text:
            vec.append((corpus_dict[key], text.count(key)))
        else:
            vec.append((corpus_dict[key], 0))
    vec = sorted(vec, key=lambda x: x[0])
    return vec
vec1 = vector_rep(texts[0], corpus_dict)
vec2 = vector_rep(texts[1], corpus_dict)
print(vec1)
print(vec2)
# Compute the cosine similarity between two sentence vectors
from math import sqrt
def similarity_with_2_sents(vec1, vec2):
    inner_product = 0
    square_length_vec1 = 0
    square_length_vec2 = 0
    for tup1, tup2 in zip(vec1, vec2):
        inner_product += tup1[1] * tup2[1]
        square_length_vec1 += tup1[1] ** 2
        square_length_vec2 += tup2[1] ** 2
    return inner_product / sqrt(square_length_vec1 * square_length_vec2)
cosine_sim = similarity_with_2_sents(vec1, vec2)
print('Cosine similarity between the two sentences: %.4f' % cosine_sim)
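# Optional cross-check (not in the original code): the same number can be obtained from the
# count vectors with scikit-learn's cosine_similarity.
from sklearn.metrics.pairwise import cosine_similarity
v1 = np.array([cnt for _, cnt in vec1]).reshape(1, -1)
v2 = np.array([cnt for _, cnt in vec2]).reshape(1, -1)
print(cosine_similarity(v1, v2)[0, 0])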
import jieba
# Define stop words / punctuation
punctuation = [",","。", ":", ";", "?"]
# Define the corpus
content = ["猪肉(猪肚的上半部分)200克、豇豆100克调料1:醪糟汁200克、鲜汤20克、优质料酒50克、葱25克、姜25克、精盐75克、八角5克 。2:姜汁(由精盐3克、味精2克、姜末25克、酱油8克、醋20克、香油5克调匀而成)",
"牛柳1000克 料酒3汤匙(45ml) 大葱1根 姜1大块(约50克) 花椒50克 干红辣椒50克 生抽1汤匙(15ml) 老抽3汤匙(45ml) 白糖4汤匙(60克) 盐1茶匙(5克) 味精1/2茶匙(3克) 熟白芝麻随意",
"用料主料:填鸭1只(约2500克)配料:鸡蛋、淀粉、面粉、荷叶饼、葱条、萝卜条调料:盐、味精、白糖、老抽、甜面酱、蒜蓉、料酒、各种香料、骨头汤、色拉油、葱、姜",
"猪肉 黄瓜 鸡蛋 木耳 盐 料酒 淀粉 植物油",
"猪肉 鸡蛋 面粉 盐 啤酒 酱油 白胡椒粉 料酒 ",
]
# Tokenize with jieba
segs_1 = [jieba.lcut(con) for con in content]
print(segs_1)
print("---------------------------------")
# Remove stop words / punctuation
tokenized = []
for sentence in segs_1:
    words = []
    for word in sentence:
        if word not in punctuation:
            words.append(word)
    tokenized.append(words)
print(tokenized)
print("---------------------------------")
# Pool the tokens of all recipes (union)
bag_of_words = [ x for item in segs_1 for x in item if x not in punctuation]
# Deduplicate
bag_of_words = list(set(bag_of_words))
print(bag_of_words)
print("---------------------------------")
# Binary bag-of-words vectors (1 if the token appears in the recipe, 0 otherwise)
bag_of_word2vec = []
for sentence in tokenized:
    tokens = [1 if token in sentence else 0 for token in bag_of_words]
    bag_of_word2vec.append(tokens)
print(bag_of_word2vec)
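# A small follow-up sketch (not in the original code): with binary vectors, recipe overlap can
# be scored directly, e.g. the Jaccard similarity between the first and the fourth recipe.
def jaccard(u, v):
    inter = sum(1 for a, b in zip(u, v) if a == 1 and b == 1)
    union = sum(1 for a, b in zip(u, v) if a == 1 or b == 1)
    return inter / union if union else 0.0
print(jaccard(bag_of_word2vec[0], bag_of_word2vec[3]))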
# Recommendation based on recipes alone
from gensim.models import Word2Vec
import jieba
# Define stop words / punctuation
punctuation = [",","。", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
sentences = [
"猪肉(猪肚的上半部分)200克、豇豆100克调料1:醪糟汁200克、鲜汤20克、优质料酒50克、葱25克、姜25克、精盐75克、八角5克 。2:姜汁(由精盐3克、味精2克、姜末25克、酱油8克、醋20克、香油5克调匀而成)",
"牛柳1000克 料酒3汤匙(45ml) 大葱1根 姜1大块(约50克) 花椒50克 干红辣椒50克 生抽1汤匙(15ml) 老抽3汤匙(45ml) 白糖4汤匙(60克) 盐1茶匙(5克) 味精1/2茶匙(3克) 熟白芝麻随意",
"用料主料:填鸭1只(约2500克)配料:鸡蛋、淀粉、面粉、荷叶饼、葱条、萝卜条调料:盐、味精、白糖、老抽、甜面酱、蒜蓉、料酒、各种香料、骨头汤、色拉油、葱、姜",
"猪肉 黄瓜 鸡蛋 木耳 盐 料酒 淀粉 植物油",
"猪肉 鸡蛋 面粉 盐 啤酒 酱油 白胡椒粉 料酒 ",
]
sentences = [jieba.lcut(sen) for sen in sentences]
tokenized = []
for sentence in sentences:
    words = []
    for word in sentence:
        if word not in punctuation:
            words.append(word)
    tokenized.append(words)
print(tokenized)
model = Word2Vec(tokenized, sg=1, vector_size=100, window=5, min_count=2, negative=1, sample=0.001, hs=1, workers=4)
model.save('model')  # save the model
model = Word2Vec.load('model')  # load the model
print(model.wv.similarity('猪肉', '猪肉'))
print(model.wv.similarity('猪肉', '料酒'))
# Predict words closest to '猪肉' (pork) and '鸡蛋' (egg) but far from '红辣椒' (red chili). Pork and egg are what the customer bought this time, so items can be recommended on the spot, drawing on the recipes and on other customers' baskets.
print(model.wv.most_similar(positive=['猪肉','鸡蛋'], negative=['红辣椒']))
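# Caveat (not in the original code): with min_count=2 on such a tiny corpus, many tokens never
# make it into the vocabulary, and querying a missing token raises a KeyError. A defensive
# sketch checks membership first via gensim's key_to_index.
for w in ['猪肉', '鸡蛋', '红辣椒']:
    print(w, w in model.wv.key_to_index)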
# Recommendation with shopping baskets added:
from gensim.models import Word2Vec
import jieba
# Define stop words / punctuation
punctuation = [",","。", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
sentences = [
"猪肉(猪肚的上半部分)200克、豇豆100克调料1:醪糟汁200克、鲜汤20克、优质料酒50克、葱25克、姜25克、精盐75克、八角5克 。2:姜汁(由精盐3克、味精2克、姜末25克、酱油8克、醋20克、香油5克调匀而成)",
"牛柳1000克 料酒3汤匙(45ml) 大葱1根 姜1大块(约50克) 花椒50克 干红辣椒50克 生抽1汤匙(15ml) 老抽3汤匙(45ml) 白糖4汤匙(60克) 盐1茶匙(5克) 味精1/2茶匙(3克) 熟白芝麻随意",
"用料主料:填鸭1只(约2500克)配料:鸡蛋、淀粉、面粉、荷叶饼、葱条、萝卜条调料:盐、味精、白糖、老抽、甜面酱、蒜蓉、料酒、各种香料、骨头汤、色拉油、葱、姜",
"猪肉 黄瓜 鸡蛋 木耳 盐 料酒 淀粉 植物油",
"猪肉 鸡蛋 面粉 盐 啤酒 酱油 白胡椒粉 料酒 ",
"鸡蛋,猪肉,酸奶",
"海天牌鲜味生抽王1.9L,白芸豆,有机菜花,芋头",
"维可得猪肝,葱油大饼,烤肠,皮冻",
]
sentences = [jieba.lcut(sen) for sen in sentences]
tokenized = []
for sentence in sentences:
    words = []
    for word in sentence:
        if word not in punctuation:
            words.append(word)
    tokenized.append(words)
print(tokenized)
model = Word2Vec(tokenized, sg=1, vector_size=100, window=5, min_count=2, negative=1, sample=0.001, hs=1, workers=4)
model.save('model')  # save the model
model = Word2Vec.load('model')  # load the model
print(model.wv.similarity('猪肉', '猪肉'))
print(model.wv.similarity('猪肉', '料酒'))
# Predict words closest to '猪肉' (pork) and '鸡蛋' (egg) but far from '红辣椒' (red chili)
print(model.wv.most_similar(positive=['猪肉','鸡蛋'], negative=['红辣椒']))
# positive holds the customer's food preferences, negative holds what they avoid
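# A minimal recommendation sketch (the recommend helper is hypothetical, not part of the
# original code): turn most_similar output into a shortlist, skipping items already in the basket.
def recommend(model, liked, disliked, basket, topn=5):
    candidates = model.wv.most_similar(positive=liked, negative=disliked, topn=topn + len(basket))
    return [w for w, _ in candidates if w not in basket][:topn]
print(recommend(model, ['猪肉', '鸡蛋'], ['红辣椒'], basket=['猪肉', '鸡蛋']))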
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
import matplotlib.pyplot as plt
import os
import tensorflow.keras as kr
input_word = "psldmerf"
# p = 猪肉 (pork), s = 生抽 (light soy sauce), l = 料酒 (cooking wine), d = 淀粉 (starch), m = 面粉 (flour), e = 鸡蛋 (egg), r = 乳制品 (dairy), f = 粉条 (vermicelli)
w_to_id = {'p': 0, 's': 1, 'l': 2, 'd': 3, 'm': 4, 'e': 5, 'r': 6,'f': 7}
# Dictionary mapping each letter to a numeric id
training_set_scaled = [0, 1, 2, 3, 4, 5, 6, 7]
x_train = []
y_train = []
for i in range(2, 8):
    x_train.append(training_set_scaled[i - 2:i])
    y_train.append(training_set_scaled[i])
# Reshape x_train to the Embedding layer's expected input: [number of samples, number of time steps]
x_train = np.reshape(x_train, (len(x_train), 2))
y_train = np.array(y_train)
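# Optional sanity check (not in the original code): each sample is a window of two ids and the
# label is the id that follows it, e.g. [0, 1] -> 2, i.e. 'ps' -> 'l' in "psldmerf".
print(x_train)
print(y_train)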
model = tf.keras.Sequential([
    Embedding(8, 2),
    SimpleRNN(10),
    Dense(8, activation='softmax')
])
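# The snippet stops at the model definition, so the predictions below would come from untrained
# weights. A minimal training step (assumed hyperparameters, not from the original code) would be:
model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['sparse_categorical_accuracy'])
model.fit(x_train, y_train, batch_size=6, epochs=100, verbose=0)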
# predict
preNum = int(input("input the number of test alphabet:"))
for i in range(preNum):
    alphabet1 = input("input test alphabet:")
    alphabet = [w_to_id[a] for a in alphabet1]
    # Reshape to the Embedding layer's expected input: [number of samples, number of time steps].
    alphabet = np.reshape(alphabet, (1, 2))
    result = model.predict(alphabet)
    pred = tf.argmax(result, axis=1)
    pred = int(pred)
    tf.print(alphabet1 + '->' + input_word[pred])
from sklearn import metrics
y_true = [1, 0, 1, 0, 1]   # ground truth for: 黄瓜 料酒 面粉 生抽 啤酒
y_pred1 = [0, 1, 0, 0, 1]  # 啤酒 胡椒粉 荷叶 淀粉 (recipe-only recommendation)
y_pred2 = [1, 1, 0, 0, 1]  # 黄瓜 植物油 木耳 胡椒粉 啤酒 面粉 淀粉 (recipes plus shopping baskets)
print('Precision1:', metrics.precision_score(y_true, y_pred1))
print('Recall1:', metrics.recall_score(y_true, y_pred1))
print('F1-score1:', metrics.f1_score(y_true, y_pred1))
print('Precision2:', metrics.precision_score(y_true, y_pred2))
print('Recall2:', metrics.recall_score(y_true, y_pred2))
print('F1-score2:', metrics.f1_score(y_true, y_pred2))
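# Optional manual cross-check of the first set of scores (not in the original code), using the
# standard definitions precision = TP / (TP + FP), recall = TP / (TP + FN), F1 = 2PR / (P + R).
tp = sum(1 for t, p in zip(y_true, y_pred1) if t == 1 and p == 1)
fp = sum(1 for t, p in zip(y_true, y_pred1) if t == 0 and p == 1)
fn = sum(1 for t, p in zip(y_true, y_pred1) if t == 1 and p == 0)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(precision, recall, 2 * precision * recall / (precision + recall))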