如何通过一个案例展示用Word2Vec将词语转换为词向量

访客自然语言处理 2026-06-05 07:45:02 2

我将通过一个具体的案例来展示如何使用Word2Vec将词语转换为词向量。

案例：分析电影评论情感

场景设定

我们有一个小型的电影评论数据集：

# Toy dataset: five short Chinese movie-review sentences, one string each.
corpus = [
    "这部电影非常精彩",
    "剧情很糟糕",
    "特效令人惊叹",
    "演员表演很糟糕",
    "音乐非常动人",
]

实现代码

from gensim.models import Word2Vec
import jieba
# Step 1: preprocessing - split every review into tokens with jieba
# (Chinese text has no whitespace word boundaries).
sentences = [jieba.lcut(text) for text in corpus]
print("分词结果：")
for tokens in sentences:
    print(tokens)

# Step 2: train the Word2Vec model on the tokenized sentences.
w2v_params = {
    "vector_size": 50,  # dimensionality of each word vector
    "window": 3,        # context window size
    "min_count": 1,     # keep every word, even those seen only once
    "sg": 1,            # 1 selects the Skip-gram architecture
    "epochs": 100,      # number of passes over the corpus
}
model = Word2Vec(sentences=sentences, **w2v_params)

# Step 3: look up the vector learned for a single word.
print("\n词向量示例：")
word = "精彩"
vector = model.wv[word]
print(f"'{word}'的词向量（前10个维度）：", vector[:10], sep="\n")
print(f"词向量维度：{len(vector)}")

# Step 4: nearest neighbours of a word by similarity score.
print("\n词语相似度：")
similar_words = model.wv.most_similar("精彩", topn=3)
print("与'精彩'最相似的词：")
for neighbour, similarity in similar_words:
    print(f"  {neighbour}: {similarity:.4f}")

# Step 5: vector arithmetic - add the positive words, subtract the negative.
print("\n向量运算：")
result = model.wv.most_similar(
    positive=["精彩", "表演"],
    negative=["糟糕"],
    topn=3,
)
print("'精彩' + '表演' - '糟糕' = ")
for neighbour, similarity in result:
    print(f"  {neighbour}: {similarity:.4f}")

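输出结果分析（以下为示意输出：语料极小且训练带有随机性，实际分词结果与数值会有所不同；另外 most_similar 只会返回语料词表中已有的词，并且不会返回查询词本身，因此下面的“优秀”“精彩”等条目仅作格式演示）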

分词结果：
['这', '部', '电影', '非常', '精彩']
['剧情', '很', '糟糕']
['特效', '令人', '惊叹']
['演员', '表演', '很', '糟糕']
['音乐', '非常', '动人']
词向量示例：
'精彩'的词向量（前10个维度）：
[ 0.0234 -0.0156  0.0341  0.0089 -0.0211  0.0456 -0.0123  0.0312 -0.0189  0.0278]
词向量维度：50
词语相似度：
与'精彩'最相似的词：
  动人: 0.8234
  惊叹: 0.7912
  非常: 0.6543
向量运算：
'精彩' + '表演' - '糟糕' = 
  动人: 0.5678
  优秀: 0.5212
  精彩: 0.4987

可视化展示

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Collect every vocabulary word and stack the vectors into one 2-D array
# whose rows are in the same order as `words`.
words = list(model.wv.key_to_index.keys())
vectors = model.wv[words]

# Reduce to 2-D with t-SNE. scikit-learn requires perplexity < n_samples, and
# this toy vocabulary is smaller than the default perplexity of 30, so cap it.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30, len(words) - 1),
)
vectors_2d = tsne.fit_transform(vectors)

# Plot the 2-D word-vector layout, one labelled point per word.
# NOTE(review): matplotlib's default font may lack CJK glyphs; if the labels
# render as empty boxes, configure a Chinese-capable font via plt.rcParams.
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = vectors_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, (x, y), xytext=(5, 2),
                 textcoords='offset points', fontsize=12)
# The original line had lost its `plt.title(` prefix, leaving a syntax error.
plt.title('词向量2D可视化')
plt.xlabel('维度1')
plt.ylabel('维度2')
plt.grid(True, alpha=0.3)
plt.show()

业务应用示例

# 情感分类器（简化版）
def sentiment_analysis(word, model):
    """基于词向量的简单情感分析"""
    positive_words = ['精彩', '动人', '惊叹']
    negative_words = ['糟糕', '无聊']
    if word in model.wv:
        # 计算与正面/负面词的平均相似度
        pos_sim = sum(model.wv.similarity(word, pw) for pw in positive_words 
                     if pw in model.wv) / len(positive_words)
        neg_sim = sum(model.wv.similarity(word, nw) for nw in negative_words 
                     if nw in model.wv) / len(negative_words)
        return "正面" if pos_sim > neg_sim else "负面"
# Try the classifier on a handful of words from the corpus vocabulary.
test_words = ["精彩", "糟糕", "特效", "表演"]
print("\n情感分析结果：")
for candidate in test_words:
    print(f"'{candidate}' -> {sentiment_analysis(candidate, model)}")