NLP Learning Notes #01

import io
import os
import sys
import requests
from collections import OrderedDict
import math
import random
import numpy as np
import paddle
from paddle.nn import Embedding
import paddle.nn.functional as F
import paddle.nn as nn

# -------------------data loading part-------------------

'''
# download the text8 dataset, a small corpus that is suitable for training word vectors
def download_text8 ():
    url = "https://dataset.bj.bcebos.com/word2vec/text8.txt"
    # use the 'requests' lib to 'get' the data
    web = requests.get (url)
    corpus = web.content
    with open ('./Practice/NLP/text8.txt', 'wb') as f:
        f.write (corpus)

download_text8 ()
'''

print ("Start Loading Text8")

def load_text8 ():
    with open ('./Practice/NLP/text8.txt', 'r') as f:
        # strip removes the given characters from both ends of the string
        corpus = f.read ().strip ("\n").strip ()
    return corpus

# corpus indicates the content of text8
corpus = load_text8 ()

print ("End Loading Text8")

# pre-processing: split the text into words
def word_split (corpus):
    # normalization: turn all characters to lowercase, then split on spaces
    corpus = corpus.lower ().split (" ")
    return corpus

# corpus is now a list of word tokens
corpus = word_split (corpus)

print ("Start Building Dictionary")

# dictionary construction & ID assignment
# the higher a word's frequency, the smaller its ID
def build_dict (corpus):
    word_freq = dict ()
    for word in corpus:
        # remember to initialize the counter first, otherwise the lookup would fail
        if not word in word_freq:
            word_freq[word] = 0
        word_freq[word] += 1

    # sort by frequency
    # dict.items () returns the entries as tuples like [('a', 1), ('b', 2), ...]
    # key = lambda dic: dic[1] means we sort by dic[1], which is the frequency
    word_freq = sorted (word_freq.items (), key = lambda dic: dic[1], reverse = True)

    # three mappings: word -> id, id -> frequency, id -> word
    word2id_dict = dict ()
    id_freq = dict ()
    id2word_dict = dict ()
    for word, freq in word_freq:
        id = len (word2id_dict)
        word2id_dict[word] = id
        id_freq[id] = freq
        id2word_dict[id] = word

    return word2id_dict, id_freq, id2word_dict

word2id_dict, id_freq, id2word_dict = build_dict (corpus)
vocab_size = len (word2id_dict)

print ("End Building Dictionary")
print ("Start Converting to ID")

# replace words with their corresponding IDs so the network can handle the data more easily
def convert_corpus2id (corpus, word2id_dict):
    corpus = [word2id_dict[word] for word in corpus]
    return corpus

corpus = convert_corpus2id (corpus, word2id_dict)

print ("End Converting to ID")
print ("Start Subsampling")

# !important
# subsampling: randomly discard high-frequency words in the corpus to improve training
def subsampling (corpus, id_freq):
    # the higher a word's frequency, the higher its probability of being discarded
    def discard (id):
        # random.uniform (a, b) returns a random real number between a and b
        # the keep probability is sqrt (1e-4 * len (corpus) / id_freq[id])
        return random.uniform (0, 1) < 1 - math.sqrt (1e-4 / id_freq[id] * len (corpus))

    corpus = [word for word in corpus if not discard (word)]
    return corpus
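
# a small illustration (optional) of the rule above: a word with count f is kept with
# probability min (1, sqrt (1e-4 * len (corpus) / f)), so very frequent words are dropped
# far more often than mid- or low-frequency ones; the two IDs below are arbitrary examples
for sample_id in [0, vocab_size // 2]:
    keep_prob = min (1.0, math.sqrt (1e-4 * len (corpus) / id_freq[sample_id]))
    print ("'%s' (count %d) is kept with probability %.4f" \
        % (id2word_dict[sample_id], id_freq[sample_id], keep_prob))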

corpus = subsampling (corpus, id_freq)

print ("End Subsampling")

# -----------------end data loading part-----------------

# use negative sampling instead of a full softmax so that training is much faster
# max_window_size is the maximum size of the sliding context window
# neg_samp_num is how many words are drawn as negative samples per center word
print ("Start Building Dataset...")
def build_dataset (corpus, max_window_size = 3, neg_samp_num = 4):
    dataset = []
    # format: (center_word, target_word, label (1 for positive, 0 for negative))

    for center_word_idx in range (len (corpus)):
        # randomly pick a window_size to add variety to the context windows
        window_size = random.randint (1, max_window_size)
        center_word = corpus[center_word_idx]

        # select positive samples from the context window
        posi_word_range = (max (0, center_word_idx - window_size), \
            min (len (corpus) - 1, center_word_idx + window_size))
        posi_word_set = [corpus[i] for i in range (posi_word_range[0], posi_word_range[1] + 1) if i != center_word_idx]

        for posi_word in posi_word_set:
            dataset.append ((center_word, posi_word, 1))

        # randomly pick negative samples that do not appear in the context window
        cnt = 0
        while cnt < neg_samp_num:
            neg_word = random.randint (0, vocab_size - 1)
            if not neg_word in posi_word_set:
                dataset.append ((center_word, neg_word, 0))
                cnt += 1

    return dataset

# only keep the first 20% of the corpus so this run finishes in a reasonable time
corpus = corpus[:int (len (corpus) * 0.2)]
dataset = build_dataset (corpus, 3, 4)

print ("End Building Dataset")

def build_batch (dataset, batch_size, epoch_num):
    center_batch = [] # save center words
    target_batch = [] # save target words
    label_batch = []  # save labels

    for epoch in range (epoch_num):
        # remember to shuffle the dataset every epoch
        random.shuffle (dataset)

        for center_word, target_word, label in dataset:
            center_batch.append ([center_word])
            target_batch.append ([target_word])
            label_batch.append (label)

            if len (center_batch) == batch_size:
                yield np.array (center_batch).astype ('int64'), \
                    np.array (target_batch).astype ('int64'), \
                    np.array (label_batch).astype ('float32')
                center_batch = []
                target_batch = []
                label_batch = []

    if len (center_batch) > 0:
        yield np.array (center_batch).astype ('int64'), \
            np.array (target_batch).astype ('int64'), \
            np.array (label_batch).astype ('float32')
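
# an optional shape check on a single batch: center and target words come out as
# [batch_size, 1] int64 arrays and labels as a [batch_size] float32 array;
# a throwaway generator with a tiny batch size is used here so the real training
# loader built below stays untouched
_c, _t, _l = next (build_batch (dataset, 8, 1))
print ("center:", _c.shape, "target:", _t.shape, "label:", _l.shape)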

class SkipGram (paddle.nn.Layer):
    # init_scale defines the range of the initial values of the word embeddings
    def __init__ (self, vocab_size, embed_size, init_scale = 0.1):
        super (SkipGram, self).__init__ ()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        # note that self.embedding_in and self.embedding_out have the same shape here
        # in the skip-gram network diagram the output matrix is drawn as [embedding_size, vocabulary_size],
        # but that transposed shape is not needed in the actual code,
        # because we never multiply a one-hot vector through the full matrix:
        # we simply look up the center word's row and the target word's row
        # and combine the two vectors afterwards
        self.embedding_in = Embedding (num_embeddings = self.vocab_size, embedding_dim = self.embed_size, \
            weight_attr = paddle.ParamAttr (
                initializer = paddle.nn.initializer.Uniform (
                    low = - init_scale, high = init_scale)))
        self.embedding_out = Embedding (num_embeddings = self.vocab_size, embedding_dim = self.embed_size, \
            weight_attr = paddle.ParamAttr (
                initializer = paddle.nn.initializer.Uniform (
                    low = - init_scale, high = init_scale)))
        # about the paddle.nn.Embedding API
        # we do not need to feed in one-hot tensors:
        # the expected input is batches of word indices
        # here is the example given on Paddle's official website
        # ---------------------------------------
        # x is a Tensor and padding_idx = -1
        # padding_idx = -1
        # x.data = [[1, 3], [2, 4], [4, 127]]
        # x.shape = [3, 2]
        # weight.shape = [128, 16]
        # the output is a Tensor:
        # out.shape = [3, 2, 16]
        # out.data = [[[0.129435295, 0.244512452, ..., 0.436322452],
        #              [0.345421456, 0.524563927, ..., 0.144534654]],
        #             [[0.345249859, 0.124939536, ..., 0.194353745],
        #              [0.945345345, 0.435394634, ..., 0.435345365]],
        #             [[0.945345345, 0.435394634, ..., 0.435345365],
        #              [0.0, 0.0, ..., 0.0]]]  # padding data
        # a negative padding_idx is converted automatically: padding_idx = -1 + 128 = 127,
        # so the word with input id 127 is treated as padding
        # ---------------------------------------
        # it can be seen that word indices are converted directly to their word embeddings

    def forward (self, center_words, target_words):
        center_words_emb = self.embedding_in (center_words)
        target_words_emb = self.embedding_out (target_words)

        # similarity is measured with the dot product rather than Euclidean distance:
        # multiply the two embeddings element-wise, then sum over the embedding axis
        # (this dot product is also the numerator of the cosine similarity
        # cos (x, y) = sum (x_i * y_i) / sqrt (sum (x_i^2) * sum (y_i^2)))
        word_eval = paddle.multiply (center_words_emb, target_words_emb)
        word_eval = paddle.sum (word_eval, axis = - 1)
        # reshape with shape = [- 1] flattens the tensor to one dimension,
        # no matter how many dimensions it had, e.g. [1, 2, 3, 4, ...]
        word_eval = paddle.reshape (word_eval, shape = [- 1])

        return word_eval
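
# an optional forward-pass check on a small throwaway model: for index tensors of
# shape [4, 1] the returned scores should come out flattened to shape [4]
_tiny_model = SkipGram (vocab_size, 8)
_scores = _tiny_model (paddle.to_tensor (np.array ([[1], [2], [3], [4]], dtype = 'int64')), \
    paddle.to_tensor (np.array ([[5], [6], [7], [8]], dtype = 'int64')))
print ("score shape:", _scores.shape)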

# used for finding synonyms
# prints the k most similar words to the query token
# current_embedding holds the word embedding table trained so far
def acquire_synonym (query_token, k, current_embedding):
    W = current_embedding.numpy ()
    x = W[word2id_dict[query_token]]
    # calculate the cosine similarity between the query vector and every row of W
    cos_dist = np.dot (W, x) / np.sqrt (np.sum (W * W, axis = 1) * np.sum (x * x) + 1e-9)
    # flatten does the same job as paddle.reshape (x, shape = [- 1])
    cos_dist = cos_dist.flatten ()
    # np.argpartition places values lower than the selected pivot value on the left
    # and higher ones on the right, like the partition step in quicksort,
    # and it returns indices instead of the values themselves
    # passing - k partitions around the k-th largest element, so [- k:] picks the top-k indices
    indices = np.argpartition (cos_dist, - k)[- k:]
    # argsort is a full sort; negating the values sorts from highest to lowest similarity
    indices = indices[np.argsort (- cos_dist[indices])]
    for i in indices:
        print ('for word %s, a similar word is %s' % (query_token, str (id2word_dict[i])))

batch_size = 512
epoch_num = 3
embedding_size = 200

print ("Start Building Batch...")
train_loader = build_batch (dataset, batch_size, epoch_num)
print ("End Building Batch")

model = SkipGram (vocab_size, embedding_size)

def train (model):
    # switch to GPU by setting use_gpu = True
    use_gpu = False
    paddle.set_device ('gpu:0' if use_gpu else 'cpu')

    optimizer = paddle.optimizer.Adam (learning_rate = 0.001, parameters = model.parameters ())
    for center_words, target_words, labels in train_loader:
        center_words = paddle.to_tensor (center_words)
        target_words = paddle.to_tensor (target_words)
        labels = paddle.to_tensor (labels)

        word_eval = model (center_words, target_words)

        # print (word_eval, labels)

        # use 'binary_cross_entropy_with_logits' directly to handle this binary classification problem
        # the function applies the sigmoid internally,
        # so we only need to pass in the raw scores (logits) before the sigmoid
        loss = F.binary_cross_entropy_with_logits (word_eval, labels)
        avg_loss = paddle.mean (loss)

        avg_loss.backward ()
        optimizer.step ()
        optimizer.clear_grad ()
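
# an optional check of the loss used above: binary_cross_entropy_with_logits (s, y) should
# match the hand-written form mean (-(y * log (sigmoid (s)) + (1 - y) * log (1 - sigmoid (s))))
_s = paddle.to_tensor (np.array ([0.5, - 1.0], dtype = 'float32'))
_y = paddle.to_tensor (np.array ([1.0, 0.0], dtype = 'float32'))
print (F.binary_cross_entropy_with_logits (_s, _y).numpy (), \
    paddle.mean (- (_y * paddle.log (F.sigmoid (_s)) + (1 - _y) * paddle.log (1 - F.sigmoid (_s)))).numpy ())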

train (model)
paddle.save (model.state_dict (), './Practice/NLP/skipgram.pt')
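
# after training, the input embedding table can be queried for nearest neighbours;
# 'king' is only an example query and is assumed to be present in the vocabulary
if 'king' in word2id_dict:
    acquire_synonym ('king', 5, model.embedding_in.weight)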