# word2vec 模型思想和代码实现

### CS224d－Day 3:

word2vec 有两个模型，CBOW 和 Skip-Gram，今天先讲 Skip-Gram 的算法和实现。

Skip-Gram 能达到什么效果？

Skip-gram 算法如下

``````def test_word2vec():

dataset = type('dummy', (), {})()     #create a dynamic object and then add attributes to it
def dummySampleTokenIdx():          #generate 1 integer between (0,4)
return random.randint(0, 4)

def getRandomContext(C):                            #getRandomContext(3) = ('d', ['d', 'd', 'd', 'e', 'a', 'd'])
tokens = ["a", "b", "c", "d", "e"]
for i in xrange(2*C)]

dataset.sampleTokenIdx = dummySampleTokenIdx        #add two methods to dataset
dataset.getRandomContext = getRandomContext

random.seed(31415)
np.random.seed(9265)                                #can be called again to re-seed the generator

#in this test, this wordvectors matrix is randomly generated,
#but in real training, this matrix is a well trained data
dummy_vectors = normalizeRows(np.random.randn(10,3))                    #generate matrix in shape=(10,3),
dummy_tokens = dict([("a",0), ("b",1), ("c",2), ("d",3), ("e",4)])      #{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}

print "==== Gradient check for skip-gram ===="
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors)  #vec is dummy_vectors

print "\n=== Results ==="
print skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset)

if __name__ == "__main__":
test_word2vec()
``````

dummy_vectors－就是要求的两个 W（输入词向量矩阵和输出词向量矩阵），只不过上下拼接成了一个矩阵：前一半的行是输入向量，后一半的行是输出向量；在这个测试里它是随机初始化的
dummy_tokens－一个字典，把词窗里的每个单词映射到它在词向量矩阵中的行号（位置）

``````def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient):
batchsize = 50
cost = 0.0
grad = np.zeros(wordVectors.shape)   #each element in wordVectors has a gradient
N = wordVectors.shape[0]
inputVectors = wordVectors[:N/2, :]
outputVectors = wordVectors[N/2:, :]

for i in xrange(batchsize):                                 #train word2vecModel for 50 times
C1 = random.randint(1, C)
centerword, context = dataset.getRandomContext(C1)      #randomly choose 1 word, and generate a context of it

if word2vecModel = skipgram:
denom = 1
else:
denom = 1

c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient)
cost += c / batchsize / denom                           #calculate the average
grad[:N/2, :] += gin / batchsize / denom
grad[N/2:, :] += gout / batchsize / denom

``````

word2vecCostAndGradient 先看用 softmax 计算的版本。其实模型可以有 skipgram 和 cbow 两种选择，word2vecCostAndGradient 也可以有 softmax 和 negative sampling 两种选择，所以 word2vec 一共有 4 种组合形式。今天先写 skipgram＋softmax 的，把这一种弄明白，其他的就好理解了：

``````def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
""" Skip-gram model in word2vec """

currentI = tokens[currentWord]                      #the order of this center word in the whole vocabulary
predicted = inputVectors[currentI, :]               #turn this word to vector representation

cost = 0.0
for cwd in contextWords:                            #contextWords is of 2C length
idx = tokens[cwd]
cc, gp, gg = word2vecCostAndGradient(predicted, idx, outputVectors, dataset)
cost += cc                                      #final cost/gradient is the 'sum' of result calculated by each word in context
gradIn[currentI, :] += gp

``````

``````def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
""" Softmax cost function for word2vec models """

probabilities = softmax(predicted.dot(outputVectors.T))
cost = -np.log(probabilities[target])

delta = probabilities
delta[target] -= 1

N = delta.shape[0]                                              #delta.shape = (5,)
D = predicted.shape[0]                                          #predicted.shape = (3,)
grad = delta.reshape((N, 1)) * predicted.reshape((1, D))
gradPred = (delta.reshape((1, N)).dot(outputVectors)).flatten()

``````

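`grad = delta.reshape((N, 1)) * predicted.reshape((1, D))`就是损失对输出向量矩阵 $U$（outputVectors）的梯度。记 $\hat{y}$ 为 softmax 输出的概率，$y$ 为目标词的 one-hot 向量，$v_c$ 为中心词向量（predicted），代码里的 `delta` 即 $\hat{y}-y$：

$$\frac{\partial J}{\partial U} = (\hat{y}-y)\,v_c^{\top}$$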

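`gradPred = (delta.reshape((1, N)).dot(outputVectors)).flatten()`就是损失对中心词向量 $v_c$（predicted）的梯度。记 $\hat{y}$ 为 softmax 输出的概率，$y$ 为目标词的 one-hot 向量，$u_w$ 为输出向量矩阵 $U$（outputVectors）的第 $w$ 行：

$$\frac{\partial J}{\partial v_c} = U^{\top}(\hat{y}-y) = \sum_{w}(\hat{y}_w-y_w)\,u_w$$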

ok，Skip-Gram 和 softmax gradient 的结合就写完了，之后再看到几行简略的算法描述，应该自己也能写出完整的代码了。