import numpy as np
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import seaborn as sns
from sklearn.manifold import MDS
texts1 ="Word2vec is a technique for natural language processing (NLP) published in 2013. The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence. As the name implies, word2vec represents each distinct word with a particular list of numbers called a vector. The vectors are chosen carefully such that they capture the semantic and syntactic qualities of words; as such, a simple mathematical function (cosine similarity) can indicate the level of semantic similarity between the words represented by those vectors. Word2vec is a group of related models that are used to produce word embeddings. These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word2vec can utilize either of two model architectures to produce these distributed representations of words: continuously sliding bag-of-words (CBOW) or continuously sliding skip-gram. In both architectures, word2vec considers both individual words and a sliding context window as it iterates over the corpus. The CBOW can be viewed as a ‘fill in the blank’ task, where the word embedding represents the way the word influences the relative probabilities of other words in the context window. Words which are semantically similar should influence these probabilities in similar ways, because semantically similar words should be used in similar contexts. The order of context words does not influence prediction (bag-of-words assumption). In the continuous skip-gram architecture, the model uses the current word to predict the surrounding window of context words.[1][2] The skip-gram architecture weighs nearby context words more heavily than more distant context words. According to the authors' note,[3] CBOW is faster while skip-gram does a better job for infrequent words. After the model has trained, the learned word embeddings are positioned in the vector space such that words that share common contexts in the corpus — that is, words that are semantically and syntactically similar — are located close to one another in the space.[1] More dissimilar words are located farther from one another in the space"
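The cosine-similarity idea mentioned in the text can be sketched with made-up toy vectors (illustrative values only, not trained embeddings; real word2vec vectors have several hundred learned dimensions):

king = np.array([[0.80, 0.65, 0.10]])
queen = np.array([[0.75, 0.70, 0.15]])
apple = np.array([[0.10, 0.20, 0.90]])

print(cosine_similarity(king, queen))  # close to 1: semantically similar
print(cosine_similarity(king, apple))  # noticeably lower: less similar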
texts2 ="GloVe, coined from Global Vectors, is a model for distributed word representation. The model is an unsupervised learning algorithm for obtaining vector representations for words. This is achieved by mapping words into a meaningful space where the distance between words is related to semantic similarity.[1] Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. It is developed as an open-source project at Stanford[2] and was launched in 2014. As log-bilinear regression model for unsupervised learning of word representations, it combines the features of two model families, namely the global matrix factorization and local context window methods.[3] GloVe can be used to find relations between words like synonyms, company-product relations, zip codes and cities, etc. However, the unsupervised learning algorithm is not effective in identifying homographs, that is, words with the same spelling and different meanings. This is as the unsupervised learning algorithm calculates a single set of vectors for words with the same morphological structure.[4] The algorithm is also used by the SpaCy library to build semantic word embedding features, while computing the top list words that match with distance measures such as cosine similarity and Euclidean distance approach.[5] GloVe was also used as the word representation framework for the online and offline systems designed to detect psychological distress in patient interviews"
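As a minimal sketch of the global word-word co-occurrence counts that GloVe is trained on (the tiny corpus and window size below are illustrative assumptions, not the actual GloVe training setup):

from collections import defaultdict

toy_corpus = [
    ["glove", "maps", "words", "into", "a", "meaningful", "space"],
    ["distance", "between", "words", "relates", "to", "semantic", "similarity"],
]
cooc_window = 2

# Count how often each pair of words appears within the context window
cooccurrence = defaultdict(float)
for sentence in toy_corpus:
    for i, word in enumerate(sentence):
        for j in range(max(0, i - cooc_window), min(len(sentence), i + cooc_window + 1)):
            if j != i:
                cooccurrence[(word, sentence[j])] += 1.0

print(len(cooccurrence), "non-zero word-word co-occurrence counts")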
f"len of text1: {len(texts1)}, len of text2: {len(texts2)}"
texts1
texts2
def text_preprocessing(
    TEXT: str,
    punctuations=r"""!()-[]{};:'"\,<>./?@#$%^&*_“~""",
    stop_words=None,
) -> list:
    """
    A method to preprocess a text
    """
    if stop_words is None:
        stop_words = [
            "and", "a", "is", "the", "in", "be", "will", "was", "but", "this",
            "were", "with", "of", "also", "on", ".", "for", "any", "its",
            "and", "are", "from", "both", "as", "to", "these", "—", "‘",
            "can", "does", "other", "because", "over", "it", "where",
        ]

    for x in TEXT.lower():
        if x in punctuations:
            TEXT = TEXT.replace(x, "")

    # Removing words that have numbers in them
    TEXT = re.sub(r"\w*\d\w*", "", TEXT)

    # Removing digits
    TEXT = re.sub(r"[0-9]+", "", TEXT)

    # Cleaning the whitespaces
    TEXT = re.sub(r"\s+", " ", TEXT).strip()

    # Setting every word to lower
    TEXT = TEXT.lower()

    # Converting all our text to a list
    TEXT = TEXT.split(" ")

    # Dropping empty strings
    TEXT = [x for x in TEXT if x != ""]

    # Dropping stop words
    TEXT = [x for x in TEXT if x not in stop_words]

    return TEXT
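A quick check of the preprocessing on a short made-up sentence (the expected output below follows from the stop-word list above):

sample = "The vectors are chosen carefully, and similar words end up close together!"
print(text_preprocessing(sample))
# -> ['vectors', 'chosen', 'carefully', 'similar', 'words', 'end', 'up', 'close', 'together']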
def get_training_data(texts: list, window=2):
    # Defining the window for context
    # window = 2

    # Creating a placeholder for the scanning of the word list
    word_lists = []
    all_text = []

    for text in texts:
        # Cleaning the text
        text = text_preprocessing(text)
        print(text)

        # Appending to the all text lists
        all_text += text

        # Creating a context dictionary
        for i, word in enumerate(text):
            for w in range(window):
                # Getting the context that is ahead by *window* words
                if i + 1 + w < len(text):
                    word_lists.append([word] + [text[(i + 1 + w)]])
                # Getting the context that is behind by *window* words
                if i - w - 1 >= 0:
                    word_lists.append([word] + [text[(i - w - 1)]])

    return word_lists, list(set(all_text))
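An illustrative call with a single short text, just to see the shape of the (focus word, context word) pairs it returns:

pairs, vocabulary = get_training_data(
    ["words used in similar contexts get similar vectors"], window=2
)
print(pairs[:4])  # [['words', 'used'], ['words', 'similar'], ['used', 'similar'], ['used', 'words']]
print(vocabulary)  # the unique cleaned words, in no particular order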
def create_unique_word_dict(TEXT: list):
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    Words = list(set(TEXT))
    Words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, Word in enumerate(Words):
        unique_word_dict.update({Word: i})

    return unique_word_dict, len(unique_word_dict)
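The next cell uses unique_word_dict1, which presumably comes from applying the two helpers above to texts1; a hedged sketch of that wiring (the variable names here are assumptions, since the original cell is not shown):

word_lists1, all_text1 = get_training_data([texts1], window=2)
unique_word_dict1, number_of_unique_words1 = create_unique_word_dict(all_text1)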
# Getting all the unique words
unique_word1 = list(unique_word_dict1.keys())
unique_word1
def one_hot_representation(word_lists, number_of_unique_words, unique_word_dict):
    # Creating the X and Y matrices using one hot encoding
    X = []
    Y = []

    for i, word_list in tqdm(enumerate(word_lists)):
        # Getting the indices
        main_word_index = unique_word_dict.get(word_list[0])
        context_word_index = unique_word_dict.get(word_list[1])
        print(word_list)
        print(word_list[0], main_word_index)
        print(word_list[1], context_word_index)

        # Creating the placeholders
        X_row = np.zeros(number_of_unique_words)
        Y_row = np.zeros(number_of_unique_words)

        # One hot encoding the main word
        X_row[main_word_index] = 1

        # One hot encoding the Y matrix words
        Y_row[context_word_index] = 1

        # Appending to the main matrices
        X.append(X_row)
        Y.append(Y_row)

    return X, Y
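Building the one-hot training matrices for the first text, following the assumed variable names from the sketch above:

X1, Y1 = one_hot_representation(word_lists1, number_of_unique_words1, unique_word_dict1)
X1, Y1 = np.asarray(X1), np.asarray(Y1)
print(X1.shape, Y1.shape)  # (number of word pairs, number of unique words)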
def CreateModel(input, output):
    # Defining the size of the embedding
    embed_size = 2

    # Defining the neural network
    inp = Input(shape=(input,))  # input = number of unique words, e.g. 21
    x = Dense(units=embed_size, activation="linear")(inp)
    x = Dense(units=output, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()

    return model
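A sketch of how the model might be trained on those matrices and the learned embedding weights read out of the first Dense layer (the epoch count and batch size here are illustrative choices, not taken from the notebook):

model1 = CreateModel(number_of_unique_words1, number_of_unique_words1)
model1.fit(x=X1, y=Y1, epochs=100, batch_size=64, verbose=0)

# Kernel of the first Dense layer: one row of embed_size values per unique word
weights1 = model1.get_weights()[0]
print(weights1.shape)  # (number of unique words, 2)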
def create_embedding_word_dict(unique_word, unique_word_dict, weights):
    # Get the weight for each unique word
    embedding_dict = {}
    for word in unique_word:
        # Pick the row of two weight values for each unique word, since weights is 21 x 2
        embedding_dict.update({word: weights[unique_word_dict.get(word)]})
    return embedding_dict
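Since embed_size is 2, the resulting embedding dictionary can be plotted directly; a sketch using the matplotlib import from the first cell (again following the assumed variable names above):

embedding_dict1 = create_embedding_word_dict(unique_word1, unique_word_dict1, weights1)

plt.figure(figsize=(8, 8))
for word, coords in embedding_dict1.items():
    plt.scatter(coords[0], coords[1])
    plt.annotate(word, (coords[0], coords[1]))
plt.show()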