Multi-Label Genre Classification-Part II

In this part, we'll look at some more preprocessing and train a BERT model using the Hugging Face Transformers library

Table of Contents

  1. Import Libraries

  2. Data and Labels

    2.1 Frequency Distribution of Genre Counts

    2.2 Genre Distribution of Data

  3. Text Preprocessing

  4. The Model

  5. Training

  6. Checking on a stray example

  7. Checking on User Input

1. Import Libraries

In this section, we will install and import all the libraries that we will need over the course of this project
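
If the environment does not already have the third-party packages, they can be installed first. This is a minimal sketch; the exact package list and versions depend on your setup, and it assumes the standard scientific stack (pandas, numpy, TensorFlow, scikit-learn, matplotlib) is already available:

# install the third-party packages used below (versions are illustrative)
!pip install transformers pattern tqdm packaging seaborn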

import json
import tarfile
import pandas as pd
import re
import string
import os
from tqdm import tqdm
from pattern.text.en import singularize
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizerFast, TFBertModel
import numpy as np
import tempfile
import sys
import subprocess
from datetime import datetime
from packaging import version
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

2. Data and Labels

Load the preprocessed dataframe in order to obtain the data and its labels

PATH = <YOUR_PATH>
preprocessed_df = pd.read_csv(os.path.join(PATH, 'summary_and_labels.csv'))
data = preprocessed_df.loc[:, 'Movie_Summary'].to_numpy().reshape(-1, 1)
labels = preprocessed_df.iloc[:, 3:].to_numpy()
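
A quick sanity check on what was loaded can catch path or column mix-ups early. This sketch only prints shapes and the genre columns, assuming the layout from Part I (a Movie_Summary column plus one-hot genre columns from the fourth column onward):

print(preprocessed_df.shape)              # (num_examples, num_columns)
print(data.shape, labels.shape)           # summaries as a column vector, labels as a multi-hot matrix
print(list(preprocessed_df.columns[3:]))  # the genre columns used as labels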

2.1 Frequency Distribution of Genre Counts

Observe the number of genres that the examples usually contain

unique, counts = np.unique(np.sum(labels, axis = 1), return_counts = True)
plt.figure(figsize = (15, 10))
plt.bar(unique, counts);
plt.xticks(unique);
plt.yticks(np.arange(0, 10000, 500))
plt.title('Frequency Distribution of Genre Counts');
plt.grid(axis = 'both')
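
To make the multi-label nature of the problem explicit, the same genre counts can be summarised numerically; a small follow-up sketch:

genre_counts = np.sum(labels, axis = 1)
print(f'{np.mean(genre_counts > 1) * 100:.1f}% of the examples are tagged with more than one genre')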

2.2 Genre Distribution of Data

Observe the genres most frequent in our dataset

plt.figure(figsize = (15, 10))
plt.bar(np.arange(1, 34, 1), np.sum(labels, axis = 0));
plt.xticks(np.arange(1,34,1), labels = preprocessed_df.iloc[:, 3:].columns, rotation = 90);
plt.yticks(np.arange(0, 21000, 1000))
plt.title('Genre Distribution of Data');
plt.grid(axis = 'both')
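
The same totals can also be read off as numbers; this sketch lists the ten most frequent genres:

genre_totals = pd.Series(np.sum(labels, axis = 0), index = preprocessed_df.iloc[:, 3:].columns)
print(genre_totals.sort_values(ascending = False).head(10))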

3. Text Preprocessing

Preprocess the text so that it is suitable as input to the model (defined later)

mean_seq_length = preprocessed_df.Movie_Summary.str.split().str.len().mean()
median_seq_length = preprocessed_df.Movie_Summary.str.split().str.len().median()
dist = list(preprocessed_df.Movie_Summary.str.split().str.len())
plt.figure(figsize = (15,10));
plt.hist(dist, bins = np.arange(0, 2700, 200));
plt.xticks(np.arange(0, 2700, 200))
plt.yticks(np.arange(0, 25000, 1000))
plt.grid(axis = 'both')
plt.title('Word Length Distribution of dataset');
train_data, val_data, train_labels, val_labels = train_test_split(data,
                                                                  labels,
                                                                  random_state = 123,
                                                                  test_size = 0.2)
train_data = np.squeeze(train_data)
val_data = np.squeeze(val_data)
train_data = train_data.astype('str')
val_data = val_data.astype('str')
# max length of the sequences
MAX_LEN = 256
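
MAX_LEN = 256 trades coverage against memory and compute. A rough check of how many summaries fit that budget (counted in words, which only approximates BERT's subword tokens):

lengths = np.array(dist)
print(f'mean = {mean_seq_length:.1f} words, median = {median_seq_length:.1f} words')
print(f'{np.mean(lengths <= MAX_LEN) * 100:.1f}% of summaries are at most {MAX_LEN} words long; longer ones get truncated')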

# tokenizer to be used to tokenize the strings
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case = True)

def tokenize_data(max_len, data, tokenizer):
  '''
  Tokenize data and return list of tokens (input_ids, attention_mask)
  Args:
  1) max_len : maximum length of sequences
  2) data : data to be tokenized
  3) tokenizer : tokenizer to be used
  Returns:
  1) input ids
  2) attention mask
  '''
  tokenized_data =  tokenizer.batch_encode_plus(data,max_length=max_len,
                                                add_special_tokens= True,
                                                padding='max_length',
                                                truncation=True,
                                                return_tensors = 'tf',
                                                return_attention_mask = True,
                                                return_token_type_ids = False)
  
  tokenized_data = [tokenized_data['input_ids'], tokenized_data['attention_mask']] 

  return tokenized_data
# tokenized train, val data
X_train = tokenize_data(MAX_LEN, train_data.tolist(), tokenizer)
X_val = tokenize_data(MAX_LEN, val_data.tolist(), tokenizer)
BATCH_SIZE = 16

def batch_data(data, labels,batch_size, buffer_size):
  '''
  Create and return TF dataset
  Args :
  1) data : list of tokens
  2) labels : list/array of OHE labels
  3) batch_size : size of a batch
  4) buffer_size : buffer_size for shuffling
  Returns:
  1) Tensor data that is shuffled, batched and prefetched
  '''
  dataset = tf.data.Dataset.from_tensor_slices(({'input_ids' : data[0], 'attention_mask':data[1]}, labels))
  dataset = dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
  return dataset
train_dataset = batch_data(X_train, train_labels, batch_size = BATCH_SIZE, buffer_size = 50000)
validation_dataset = batch_data(X_val, val_labels, batch_size = BATCH_SIZE, buffer_size = 50000)
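
Before training, it is worth pulling a single batch from the pipeline to confirm the shapes the model will receive; this is purely a sanity check:

for features, batch_labels in train_dataset.take(1):
  print(features['input_ids'].shape)       # (BATCH_SIZE, MAX_LEN)
  print(features['attention_mask'].shape)  # (BATCH_SIZE, MAX_LEN)
  print(batch_labels.shape)                # (BATCH_SIZE, number of genres)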

4. The Model

Define the model template to be used in the training

def create_model(max_length = 256):
  """
  Creates the model using Tensorflow Functional APIs
  Args:
  1) Maximum length of sequences
  Returns:
  1) Model template to be trained
  """
  bert_model = TFBertModel.from_pretrained('bert-base-uncased')
  input_ids = tf.keras.layers.Input(shape = (max_length, ), dtype = tf.int32, name = 'input_ids')
  attention_mask = tf.keras.layers.Input(shape = (max_length, ), dtype = tf.int32, name = 'attention_mask')
  x = bert_model.bert(input_ids, attention_mask)
  x = x.pooler_output
  x = tf.keras.layers.Dropout(0.2, name = 'dropout_layer_1')(x)
  x = tf.keras.layers.Dense(256, activation = 'relu', name = 'hidden_dense_layer')(x)
  x = tf.keras.layers.Dropout(0.2, name = 'dropout_layer_2')(x)
  x = tf.keras.layers.Dense(33, name = 'dense_layer_output')(x)
  out = tf.keras.layers.Activation('sigmoid', name = 'activation_layer_output')(x)
  model = tf.keras.Model(inputs = [input_ids, attention_mask], outputs = out)
  model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5),
                loss = tf.keras.losses.BinaryCrossentropy(),
                metrics = tf.metrics.BinaryAccuracy())
  return model
model = create_model()
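
model.summary() shows the resulting layer stack; nearly all of the trainable parameters come from bert-base-uncased (roughly 110M), with only a small classification head on top:

# inspect the layer stack and parameter counts
model.summary()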

5. Training

Train the model on data

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 1, min_lr = 1e-8)
history = model.fit(train_dataset,
                    validation_data = validation_dataset,
                    epochs = 3,
                    callbacks = [reduce_lr])
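
The History object returned by fit records the losses per epoch; plotting them gives a quick view of whether three epochs are enough. A minimal sketch using the same matplotlib conventions as earlier:

plt.figure(figsize = (10, 6))
plt.plot(history.history['loss'], label = 'train loss')
plt.plot(history.history['val_loss'], label = 'val loss')
plt.xlabel('epoch')
plt.ylabel('binary cross-entropy')
plt.legend()
plt.grid(axis = 'both')
plt.title('Training and Validation Loss');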

6. Checking on a stray example

Check how the model performs on a random example in the validation dataset

model.save_weights(os.path.join(PATH, 'Checkpoint/Model_Checkpoint'))
# rebuild the model template and load the saved weights into it
new_model = create_model()
new_model.load_weights(os.path.join(PATH, 'Checkpoint/Model_Checkpoint'))
predictions = new_model.predict(validation_dataset.take(1))[0]
labels = preprocessed_df.iloc[:, 3:].columns
for i in range(len(labels)):
  if predictions[i] > 0.5:
    print(f'{labels[i]}:{predictions[i]*100:.2f}%')
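
Since the validation pipeline is shuffled, the batch predicted on above is not easily matched back to its true labels; the sketch below therefore draws one batch, predicts on it, and compares the first example's predicted genres (at the 0.5 threshold) with its true genres:

genre_names = preprocessed_df.iloc[:, 3:].columns
for features, batch_labels in validation_dataset.take(1):
  batch_preds = new_model.predict(features)
  true_genres = [genre_names[j] for j in np.where(batch_labels[0].numpy() == 1)[0]]
  pred_genres = [genre_names[j] for j in np.where(batch_preds[0] > 0.5)[0]]
  print('True genres     :', true_genres)
  print('Predicted genres:', pred_genres)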

7. Checking on User Input

Check the model's output on user input. Instead of a fixed 0.5 cut-off, every genre whose score is within a factor of three of the highest score is reported

MODEL_DIR = <YOUR_MODEL_DIR>
version = 1
export_path = os.path.join(MODEL_DIR, str(version))
tf.keras.models.save_model(model,
                           export_path,
                           overwrite = True,
                           include_optimizer=True,
                           save_format = 'tf',
                           signatures = None,
                           options = None)
saved_model = tf.keras.models.load_model(export_path)
# user-provided movie summary (a plain string), wrapped in a list for the tokenizer
data = [<YOUR_MOVIE_SUMMARY>]
tokenized_data = tokenize_data(MAX_LEN, data, tokenizer)
predictions = saved_model.predict({'input_ids':tokenized_data[0], 'attention_mask': tokenized_data[1]})
predictions = np.squeeze(np.asarray(predictions))
labels = preprocessed_df.iloc[:, 3:].columns
maxm = max(predictions)
for i in range(len(labels)):
  if maxm/predictions[i] <= 3:
    print(f'{labels[i]}:{predictions[i]*100:.2f}%')

action:81.47%
adventure:87.00%
animation:41.06%
fantasy:39.80%
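
For repeated queries it can be handy to fold the steps above into a small helper. This is only a sketch; predict_genres is a hypothetical wrapper that assumes tokenize_data, tokenizer, saved_model, MAX_LEN and preprocessed_df are already in scope:

def predict_genres(summary, threshold_ratio = 3):
  '''
  Return (genre, score%) pairs whose score is within threshold_ratio of the top score
  '''
  tokens = tokenize_data(MAX_LEN, [summary], tokenizer)
  preds = saved_model.predict({'input_ids': tokens[0], 'attention_mask': tokens[1]})
  preds = np.squeeze(np.asarray(preds))
  genre_names = preprocessed_df.iloc[:, 3:].columns
  top = preds.max()
  return [(genre_names[i], round(float(preds[i]) * 100, 2))
          for i in range(len(genre_names))
          if top / preds[i] <= threshold_ratio]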

Continue on to the next part for model deployment