NLP Learning Notes #01

import io
import os
import sys
import requests
from collections import OrderedDict
import math
import random
import numpy as np
import paddle
from paddle.nn import Embedding
import paddle.nn.functional as F
import paddle.nn as nn

# -------------------data loading part-------------------

'''
# download the text8 dataset, a small corpus that is suitable for training word vectors
def download_text8 ():
    url = "https://dataset.bj.bcebos.com/word2vec/text8.txt"
    # use the 'requests' lib to 'get' the data
    web = requests.get (url)
    corpus = web.content
    with open ('./Practice/NLP/text8.txt', 'wb') as f:
        f.write (corpus)

download_text8 ()
'''

print ("Start Loading Text8")

def load_text8 ():
    with open ('./Practice/NLP/text8.txt', 'r') as f:
        # strip removes the given characters from both ends of the string
        corpus = f.read ().strip ("\n").strip ()
    return corpus

# corpus indicates the content of text8
corpus = load_text8 ()

print ("End Loading Text8")

# pre-processing: split the text into words
def word_split (corpus):
    # normalization: turn all characters to lowercase, then split on spaces
    corpus = corpus.lower ().split (" ")
    return corpus

# corpus is now a list of word tokens
corpus = word_split (corpus)

print ("Start Building Dictionary")

# dictionary construction & ID assignment
# the higher a word's frequency, the smaller its ID
def build_dict (corpus):
    word_freq = dict ()
    for word in corpus:
        # remember to initialize the counter first, otherwise the lookup would fail
        if not word in word_freq:
            word_freq[word] = 0
        word_freq[word] += 1

    # sort by frequency
    # dict.items () returns the entries as tuples like [('a', 1), ('b', 2), ...]
    # key = lambda dic: dic[1] means we sort by dic[1], which is the frequency
    word_freq = sorted (word_freq.items (), key = lambda dic: dic[1], reverse = True)

    # three mappings: word -> id, id -> frequency, id -> word
    word2id_dict = dict ()
    id_freq = dict ()
    id2word_dict = dict ()
    for word, freq in word_freq:
        id = len (word2id_dict)
        word2id_dict[word] = id
        id_freq[id] = freq
        id2word_dict[id] = word

    return word2id_dict, id_freq, id2word_dict

word2id_dict, id_freq, id2word_dict = build_dict (corpus)
vocab_size = len (word2id_dict)

print ("End Building Dictionary")
print ("Start Converting to ID")

# replace words with their corresponding IDs so the network can handle the data more easily
def convert_corpus2id (corpus, word2id_dict):
    corpus = [word2id_dict[word] for word in corpus]
    return corpus

corpus = convert_corpus2id (corpus, word2id_dict)

print ("End Converting to ID")
print ("Start Subsampling")

# !important
# subsampling: randomly discard high-frequency words in the corpus to improve training
def subsampling (corpus, id_freq):
    # the higher a word's frequency, the higher its probability of being discarded
    def discard (id):
        # random.uniform (a, b) returns a random real number between a and b
        # the keep probability is sqrt (1e-4 * len (corpus) / id_freq[id])
        return random.uniform (0, 1) < 1 - math.sqrt (1e-4 / id_freq[id] * len (corpus))

    corpus = [word for word in corpus if not discard (word)]
    return corpus
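
# a small illustration (optional) of the rule above: a word with count f is kept with
# probability min (1, sqrt (1e-4 * len (corpus) / f)), so very frequent words are dropped
# far more often than mid- or low-frequency ones; the two IDs below are arbitrary examples
for sample_id in [0, vocab_size // 2]:
    keep_prob = min (1.0, math.sqrt (1e-4 * len (corpus) / id_freq[sample_id]))
    print ("'%s' (count %d) is kept with probability %.4f" \
        % (id2word_dict[sample_id], id_freq[sample_id], keep_prob))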

corpus = subsampling (corpus, id_freq)

print ("End Subsampling")

# -----------------end data loading part-----------------

# use negative sampling instead of a full softmax so that training is much faster
# max_window_size is the maximum size of the sliding context window
# neg_samp_num is how many words are drawn as negative samples per center word
print ("Start Building Dataset...")
def build_dataset (corpus, max_window_size = 3, neg_samp_num = 4):
    dataset = []
    # format: (center_word, target_word, label (1 for positive, 0 for negative))

    for center_word_idx in range (len (corpus)):
        # randomly pick a window_size to add variety to the context windows
        window_size = random.randint (1, max_window_size)
        center_word = corpus[center_word_idx]

        # select positive samples from the context window
        posi_word_range = (max (0, center_word_idx - window_size), \
            min (len (corpus) - 1, center_word_idx + window_size))
        posi_word_set = [corpus[i] for i in range (posi_word_range[0], posi_word_range[1] + 1) if i != center_word_idx]

        for posi_word in posi_word_set:
            dataset.append ((center_word, posi_word, 1))

        # randomly pick negative samples that do not appear in the context window
        cnt = 0
        while cnt < neg_samp_num:
            neg_word = random.randint (0, vocab_size - 1)
            if not neg_word in posi_word_set:
                dataset.append ((center_word, neg_word, 0))
                cnt += 1

    return dataset

# only keep the first 20% of the corpus so this run finishes in a reasonable time
corpus = corpus[:int (len (corpus) * 0.2)]
dataset = build_dataset (corpus, 3, 4)

print ("End Building Dataset")

def build_batch (dataset, batch_size, epoch_num):
    center_batch = [] # save center words
    target_batch = [] # save target words
    label_batch = []  # save labels

    for epoch in range (epoch_num):
        # remember to shuffle the dataset every epoch
        random.shuffle (dataset)

        for center_word, target_word, label in dataset:
            center_batch.append ([center_word])
            target_batch.append ([target_word])
            label_batch.append (label)

            if len (center_batch) == batch_size:
                yield np.array (center_batch).astype ('int64'), \
                    np.array (target_batch).astype ('int64'), \
                    np.array (label_batch).astype ('float32')
                center_batch = []
                target_batch = []
                label_batch = []

    if len (center_batch) > 0:
        yield np.array (center_batch).astype ('int64'), \
            np.array (target_batch).astype ('int64'), \
            np.array (label_batch).astype ('float32')
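
# an optional shape check on a single batch: center and target words come out as
# [batch_size, 1] int64 arrays and labels as a [batch_size] float32 array;
# a throwaway generator with a tiny batch size is used here so the real training
# loader built below stays untouched
_c, _t, _l = next (build_batch (dataset, 8, 1))
print ("center:", _c.shape, "target:", _t.shape, "label:", _l.shape)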

class SkipGram (paddle.nn.Layer):
    # init_scale defines the range of the initial values of the word embeddings
    def __init__ (self, vocab_size, embed_size, init_scale = 0.1):
        super (SkipGram, self).__init__ ()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        # note that self.embedding_in and self.embedding_out have the same shape here
        # in the skip-gram network diagram the output matrix is drawn as [embedding_size, vocabulary_size],
        # but that transposed shape is not needed in the actual code,
        # because we never multiply a one-hot vector through the full matrix:
        # we simply look up the center word's row and the target word's row
        # and combine the two vectors afterwards
        self.embedding_in = Embedding (num_embeddings = self.vocab_size, embedding_dim = self.embed_size, \
            weight_attr = paddle.ParamAttr (
                initializer = paddle.nn.initializer.Uniform (
                    low = - init_scale, high = init_scale)))
        self.embedding_out = Embedding (num_embeddings = self.vocab_size, embedding_dim = self.embed_size, \
            weight_attr = paddle.ParamAttr (
                initializer = paddle.nn.initializer.Uniform (
                    low = - init_scale, high = init_scale)))
        # about the paddle.nn.Embedding API
        # we do not need to feed in one-hot tensors:
        # the expected input is batches of word indices
        # here is the example given on Paddle's official website
        # ---------------------------------------
        # x is a Tensor and padding_idx = -1
        # padding_idx = -1
        # x.data = [[1, 3], [2, 4], [4, 127]]
        # x.shape = [3, 2]
        # weight.shape = [128, 16]
        # the output is a Tensor:
        # out.shape = [3, 2, 16]
        # out.data = [[[0.129435295, 0.244512452, ..., 0.436322452],
        #              [0.345421456, 0.524563927, ..., 0.144534654]],
        #             [[0.345249859, 0.124939536, ..., 0.194353745],
        #              [0.945345345, 0.435394634, ..., 0.435345365]],
        #             [[0.945345345, 0.435394634, ..., 0.435345365],
        #              [0.0, 0.0, ..., 0.0]]]  # padding data
        # a negative padding_idx is converted automatically: padding_idx = -1 + 128 = 127,
        # so the word with input id 127 is treated as padding
        # ---------------------------------------
        # it can be seen that word indices are converted directly to their word embeddings

    def forward (self, center_words, target_words):
        center_words_emb = self.embedding_in (center_words)
        target_words_emb = self.embedding_out (target_words)

        # similarity is measured with the dot product rather than Euclidean distance:
        # multiply the two embeddings element-wise, then sum over the embedding axis
        # (this dot product is also the numerator of the cosine similarity
        # cos (x, y) = sum (x_i * y_i) / sqrt (sum (x_i^2) * sum (y_i^2)))
        word_eval = paddle.multiply (center_words_emb, target_words_emb)
        word_eval = paddle.sum (word_eval, axis = - 1)
        # reshape with shape = [- 1] flattens the tensor to one dimension,
        # no matter how many dimensions it had, e.g. [1, 2, 3, 4, ...]
        word_eval = paddle.reshape (word_eval, shape = [- 1])

        return word_eval
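
# an optional forward-pass check on a small throwaway model: for index tensors of
# shape [4, 1] the returned scores should come out flattened to shape [4]
_tiny_model = SkipGram (vocab_size, 8)
_scores = _tiny_model (paddle.to_tensor (np.array ([[1], [2], [3], [4]], dtype = 'int64')), \
    paddle.to_tensor (np.array ([[5], [6], [7], [8]], dtype = 'int64')))
print ("score shape:", _scores.shape)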

# used for finding synonyms
# prints the k most similar words to the query token
# current_embedding holds the word embedding table trained so far
def acquire_synonym (query_token, k, current_embedding):
    W = current_embedding.numpy ()
    x = W[word2id_dict[query_token]]
    # calculate the cosine similarity between the query vector and every row of W
    cos_dist = np.dot (W, x) / np.sqrt (np.sum (W * W, axis = 1) * np.sum (x * x) + 1e-9)
    # flatten does the same job as paddle.reshape (x, shape = [- 1])
    cos_dist = cos_dist.flatten ()
    # np.argpartition places values lower than the selected pivot value on the left
    # and higher ones on the right, like the partition step in quicksort,
    # and it returns indices instead of the values themselves
    # passing - k partitions around the k-th largest element, so [- k:] picks the top-k indices
    indices = np.argpartition (cos_dist, - k)[- k:]
    # argsort is a full sort; negating the values sorts from highest to lowest similarity
    indices = indices[np.argsort (- cos_dist[indices])]
    for i in indices:
        print ('for word %s, a similar word is %s' % (query_token, str (id2word_dict[i])))

batch_size = 512
epoch_num = 3
embedding_size = 200

print ("Start Building Batch...")
train_loader = build_batch (dataset, batch_size, epoch_num)
print ("End Building Batch")

model = SkipGram (vocab_size, embedding_size)

def train (model):
    # switch to GPU by setting use_gpu = True
    use_gpu = False
    paddle.set_device ('gpu:0' if use_gpu else 'cpu')

    optimizer = paddle.optimizer.Adam (learning_rate = 0.001, parameters = model.parameters ())
    for center_words, target_words, labels in train_loader:
        center_words = paddle.to_tensor (center_words)
        target_words = paddle.to_tensor (target_words)
        labels = paddle.to_tensor (labels)

        word_eval = model (center_words, target_words)

        # print (word_eval, labels)

        # use 'binary_cross_entropy_with_logits' directly to handle this binary classification problem
        # the function applies the sigmoid internally,
        # so we only need to pass in the raw scores (logits) before the sigmoid
        loss = F.binary_cross_entropy_with_logits (word_eval, labels)
        avg_loss = paddle.mean (loss)

        avg_loss.backward ()
        optimizer.step ()
        optimizer.clear_grad ()
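
# an optional check of the loss used above: binary_cross_entropy_with_logits (s, y) should
# match the hand-written form mean (-(y * log (sigmoid (s)) + (1 - y) * log (1 - sigmoid (s))))
_s = paddle.to_tensor (np.array ([0.5, - 1.0], dtype = 'float32'))
_y = paddle.to_tensor (np.array ([1.0, 0.0], dtype = 'float32'))
print (F.binary_cross_entropy_with_logits (_s, _y).numpy (), \
    paddle.mean (- (_y * paddle.log (F.sigmoid (_s)) + (1 - _y) * paddle.log (1 - F.sigmoid (_s)))).numpy ())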

train (model)
paddle.save (model.state_dict (), './Practice/NLP/skipgram.pt')
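
# after training, the input embedding table can be queried for nearest neighbours;
# 'king' is only an example query and is assumed to be present in the vocabulary
if 'king' in word2id_dict:
    acquire_synonym ('king', 5, model.embedding_in.weight)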