import numpy as np
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import seaborn as sns
from sklearn.manifold import MDS
texts1 ="Word2vec is a technique for natural language processing (NLP) published in 2013. The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence. As the name implies, word2vec represents each distinct word with a particular list of numbers called a vector. The vectors are chosen carefully such that they capture the semantic and syntactic qualities of words; as such, a simple mathematical function (cosine similarity) can indicate the level of semantic similarity between the words represented by those vectors. Word2vec is a group of related models that are used to produce word embeddings. These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word2vec can utilize either of two model architectures to produce these distributed representations of words: continuously sliding bag-of-words (CBOW) or continuously sliding skip-gram. In both architectures, word2vec considers both individual words and a sliding context window as it iterates over the corpus. The CBOW can be viewed as a ‘fill in the blank’ task, where the word embedding represents the way the word influences the relative probabilities of other words in the context window. Words which are semantically similar should influence these probabilities in similar ways, because semantically similar words should be used in similar contexts. The order of context words does not influence prediction (bag-of-words assumption). In the continuous skip-gram architecture, the model uses the current word to predict the surrounding window of context words.[1][2] The skip-gram architecture weighs nearby context words more heavily than more distant context words. According to the authors' note,[3] CBOW is faster while skip-gram does a better job for infrequent words. After the model has trained, the learned word embeddings are positioned in the vector space such that words that share common contexts in the corpus — that is, words that are semantically and syntactically similar — are located close to one another in the space.[1] More dissimilar words are located farther from one another in the space"
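The cosine-similarity idea mentioned in the text can be sketched with made-up toy vectors (illustrative values only, not trained embeddings; real word2vec vectors have several hundred learned dimensions):

king = np.array([[0.80, 0.65, 0.10]])
queen = np.array([[0.75, 0.70, 0.15]])
apple = np.array([[0.10, 0.20, 0.90]])

print(cosine_similarity(king, queen))  # close to 1: semantically similar
print(cosine_similarity(king, apple))  # noticeably lower: less similar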
texts2 ="GloVe, coined from Global Vectors, is a model for distributed word representation. The model is an unsupervised learning algorithm for obtaining vector representations for words. This is achieved by mapping words into a meaningful space where the distance between words is related to semantic similarity.[1] Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. It is developed as an open-source project at Stanford[2] and was launched in 2014. As log-bilinear regression model for unsupervised learning of word representations, it combines the features of two model families, namely the global matrix factorization and local context window methods.[3] GloVe can be used to find relations between words like synonyms, company-product relations, zip codes and cities, etc. However, the unsupervised learning algorithm is not effective in identifying homographs, that is, words with the same spelling and different meanings. This is as the unsupervised learning algorithm calculates a single set of vectors for words with the same morphological structure.[4] The algorithm is also used by the SpaCy library to build semantic word embedding features, while computing the top list words that match with distance measures such as cosine similarity and Euclidean distance approach.[5] GloVe was also used as the word representation framework for the online and offline systems designed to detect psychological distress in patient interviews"
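As a minimal sketch of the global word-word co-occurrence counts that GloVe is trained on (the tiny corpus and window size below are illustrative assumptions, not the actual GloVe training setup):

from collections import defaultdict

toy_corpus = [
    ["glove", "maps", "words", "into", "a", "meaningful", "space"],
    ["distance", "between", "words", "relates", "to", "semantic", "similarity"],
]
cooc_window = 2

# Count how often each pair of words appears within the context window
cooccurrence = defaultdict(float)
for sentence in toy_corpus:
    for i, word in enumerate(sentence):
        for j in range(max(0, i - cooc_window), min(len(sentence), i + cooc_window + 1)):
            if j != i:
                cooccurrence[(word, sentence[j])] += 1.0

print(len(cooccurrence), "non-zero word-word co-occurrence counts")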
f"len of text1: {len(texts1)}, len of text2: {len(texts2)}"
texts1
texts2
def text_preprocessing(
    TEXT: str,
    punctuations=r"""!()-[]{};:'"\,<>./?@#$%^&*_“~""",
    stop_words=None,
) -> list:
    """
    A method to preprocess a text
    """
    if stop_words is None:
        stop_words = [
            "and", "a", "is", "the", "in", "be", "will", "was", "but", "this",
            "were", "with", "of", "also", "on", ".", "for", "any", "its",
            "and", "are", "from", "both", "as", "to", "these", "—", "‘",
            "can", "does", "other", "because", "over", "it", "where",
        ]

    for x in TEXT.lower():
        if x in punctuations:
            TEXT = TEXT.replace(x, "")

    # Removing words that have numbers in them
    TEXT = re.sub(r"\w*\d\w*", "", TEXT)

    # Removing digits
    TEXT = re.sub(r"[0-9]+", "", TEXT)

    # Cleaning the whitespaces
    TEXT = re.sub(r"\s+", " ", TEXT).strip()

    # Setting every word to lower
    TEXT = TEXT.lower()

    # Converting all our text to a list
    TEXT = TEXT.split(" ")

    # Dropping empty strings
    TEXT = [x for x in TEXT if x != ""]

    # Dropping stop words
    TEXT = [x for x in TEXT if x not in stop_words]

    return TEXT
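A quick check of the preprocessing on a short made-up sentence (the expected output below follows from the stop-word list above):

sample = "The vectors are chosen carefully, and similar words end up close together!"
print(text_preprocessing(sample))
# -> ['vectors', 'chosen', 'carefully', 'similar', 'words', 'end', 'up', 'close', 'together']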
def get_training_data(texts: list, window=2):
    # Defining the window for context
    # window = 2

    # Creating a placeholder for the scanning of the word list
    word_lists = []
    all_text = []

    for text in texts:
        # Cleaning the text
        text = text_preprocessing(text)
        print(text)

        # Appending to the all text lists
        all_text += text

        # Creating a context dictionary
        for i, word in enumerate(text):
            for w in range(window):
                # Getting the context that is ahead by *window* words
                if i + 1 + w < len(text):
                    word_lists.append([word] + [text[(i + 1 + w)]])
                # Getting the context that is behind by *window* words
                if i - w - 1 >= 0:
                    word_lists.append([word] + [text[(i - w - 1)]])

    return word_lists, list(set(all_text))
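An illustrative call with a single short text, just to see the shape of the (focus word, context word) pairs it returns:

pairs, vocabulary = get_training_data(
    ["words used in similar contexts get similar vectors"], window=2
)
print(pairs[:4])  # [['words', 'used'], ['words', 'similar'], ['used', 'similar'], ['used', 'words']]
print(vocabulary)  # the unique cleaned words, in no particular order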
def create_unique_word_dict(TEXT: list):
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    Words = list(set(TEXT))
    Words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, Word in enumerate(Words):
        unique_word_dict.update({Word: i})

    return unique_word_dict, len(unique_word_dict)
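The next cell uses unique_word_dict1, which presumably comes from applying the two helpers above to texts1; a hedged sketch of that wiring (the variable names here are assumptions, since the original cell is not shown):

word_lists1, all_text1 = get_training_data([texts1], window=2)
unique_word_dict1, number_of_unique_words1 = create_unique_word_dict(all_text1)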
# Getting all the unique words
unique_word1 = list(unique_word_dict1.keys())
unique_word1
def one_hot_representation(word_lists, number_of_unique_words, unique_word_dict):
    # Creating the X and Y matrices using one hot encoding
    X = []
    Y = []

    for i, word_list in tqdm(enumerate(word_lists)):
        # Getting the indices
        main_word_index = unique_word_dict.get(word_list[0])
        context_word_index = unique_word_dict.get(word_list[1])
        print(word_list)
        print(word_list[0], main_word_index)
        print(word_list[1], context_word_index)

        # Creating the placeholders
        X_row = np.zeros(number_of_unique_words)
        Y_row = np.zeros(number_of_unique_words)

        # One hot encoding the main word
        X_row[main_word_index] = 1

        # One hot encoding the Y matrix words
        Y_row[context_word_index] = 1

        # Appending to the main matrices
        X.append(X_row)
        Y.append(Y_row)

    return X, Y
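Building the one-hot training matrices for the first text, following the assumed variable names from the sketch above:

X1, Y1 = one_hot_representation(word_lists1, number_of_unique_words1, unique_word_dict1)
X1, Y1 = np.asarray(X1), np.asarray(Y1)
print(X1.shape, Y1.shape)  # (number of word pairs, number of unique words)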
def CreateModel(input, output):
    # Defining the size of the embedding
    embed_size = 2

    # Defining the neural network
    inp = Input(shape=(input,))  # input = number of unique words, e.g. 21
    x = Dense(units=embed_size, activation="linear")(inp)
    x = Dense(units=output, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()

    return model
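A sketch of how the model might be trained on those matrices and the learned embedding weights read out of the first Dense layer (the epoch count and batch size here are illustrative choices, not taken from the notebook):

model1 = CreateModel(number_of_unique_words1, number_of_unique_words1)
model1.fit(x=X1, y=Y1, epochs=100, batch_size=64, verbose=0)

# Kernel of the first Dense layer: one row of embed_size values per unique word
weights1 = model1.get_weights()[0]
print(weights1.shape)  # (number of unique words, 2)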
def create_embedding_word_dict(unique_word, unique_word_dict, weights):
    # Get the weight for each unique word
    embedding_dict = {}
    for word in unique_word:
        # Pick the row of two weight values for each unique word, since weights is 21 x 2
        embedding_dict.update({word: weights[unique_word_dict.get(word)]})
    return embedding_dict
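Since embed_size is 2, the resulting embedding dictionary can be plotted directly; a sketch using the matplotlib import from the first cell (again following the assumed variable names above):

embedding_dict1 = create_embedding_word_dict(unique_word1, unique_word_dict1, weights1)

plt.figure(figsize=(8, 8))
for word, coords in embedding_dict1.items():
    plt.scatter(coords[0], coords[1])
    plt.annotate(word, (coords[0], coords[1]))
plt.show()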