This guide will walk you through the process of creating a chatbot using Transformers, from data collection to model deployment.
Step 1: Collect and Prepare the Dataset
1.1 Collecting Data
To train a chatbot, you need a dataset of conversational examples. Public datasets such as the Cornell Movie-Dialogs Corpus are good starting points. This dataset contains conversations extracted from movie scripts and can be downloaded from the corpus page on the Cornell University website.
1.2 Preprocessing the Data
The dataset needs to be cleaned and tokenized before feeding it into the Transformer model. The text must be converted into sequences of tokens, and padding should be applied to ensure uniform input lengths.
import tensorflow as tf
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Load dataset
def load_dataset(path):
    """Read a corpus file and return the utterance text of each non-empty line.

    Every line is split on the ' +++$+++ ' field separator and only the last
    field is kept; a line without the separator is returned whole.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of strings, one per non-empty line, in file order.
    """
    # The context manager closes the handle even if reading fails.
    # errors='ignore' drops undecodable bytes instead of raising.
    with open(path, encoding='utf-8', errors='ignore') as corpus_file:
        lines = corpus_file.read().split('\n')
    return [line.split(' +++$+++ ')[-1] for line in lines if line]
# Preprocess text
def preprocess_text(text):
    """Normalize a raw utterance into lowercase, space-separated tokens.

    The text is lowercased, each of the punctuation marks ? . ! , ¿ is
    surrounded by spaces so it becomes its own token, and every run of other
    characters that are not ASCII letters (digits, quotes, whitespace,
    accented letters, ...) is replaced by a single space.

    Args:
        text: The raw utterance.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    text = text.lower()
    # Pad punctuation so that "you?" becomes "you ?".
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # Collapse every run of disallowed characters (including the extra spaces
    # introduced above) into one space.
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    # Strip last: the padding step can leave a space at either end.
    return text.strip()
# Prepare data
def prepare_data(conversations):
    """Build padded (input, target) token-id arrays from a list of utterances.

    Each utterance is paired with the one that follows it in the list: line i
    is the input and line i + 1 is the target. Pairing ignores conversation
    boundaries because the list carries no such information.

    Targets are wrapped in the literal tokens 'startseq' and 'endseq' so the
    decoder has an explicit first input token and a stop token; response
    generation looks these two tokens up in the tokenizer vocabulary.

    Args:
        conversations: List of raw utterance strings.

    Returns:
        A tuple (input_sequences, target_sequences, tokenizer, max_len) where
        both sequence arrays are post-padded with 0 to length max_len, the
        longest tokenized sequence among inputs and targets.

    Raises:
        ValueError: If fewer than two utterances are given, so no pair exists.
    """
    if len(conversations) < 2:
        raise ValueError('prepare_data needs at least two utterances to build a pair')

    input_texts = [preprocess_text(line) for line in conversations[:-1]]
    target_texts = [
        f'startseq {preprocess_text(line)} endseq' for line in conversations[1:]
    ]

    # filters='' keeps the punctuation tokens produced by preprocess_text.
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(input_texts + target_texts)
    input_sequences = tokenizer.texts_to_sequences(input_texts)
    target_sequences = tokenizer.texts_to_sequences(target_texts)

    max_len = max(len(seq) for seq in input_sequences + target_sequences)
    input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='post')
    target_sequences = pad_sequences(target_sequences, maxlen=max_len, padding='post')
    return input_sequences, target_sequences, tokenizer, max_len
# Load the corpus from disk, then turn it into padded training arrays.
DATASET_PATH = 'cornell_movie_dialogs_corpus.txt'
conversations = load_dataset(DATASET_PATH)
(input_sequences,
 target_sequences,
 tokenizer,
 max_len) = prepare_data(conversations)
Step 2: Build the Transformer Neural Network
2.1 Define the Transformer Model
Transformers rely on attention mechanisms instead of recurrence to process input sequences. They consist of an encoder and a decoder, where the encoder processes the input text, and the decoder generates the response.
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LayerNormalization, Dropout
from tensorflow.keras.models import Model
import numpy as np
# Positional Encoding
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles pos / 10000^(2 * (i // 2) / d_model).

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of indices; consecutive pairs
            (2k, 2k + 1) share the same rate because of the floor division.
        d_model: Size of the embedding dimension.

    Returns:
        The product of pos and the per-dimension rate, following NumPy
        broadcasting of pos against i.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table as a float32 tensor.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Size of the embedding dimension (columns).

    Returns:
        A tf.float32 tensor of shape (1, position, d_model); even columns hold
        the sine of the angle and odd columns the cosine.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_indices, d_model)

    # Even columns (2i) take the sine, odd columns (2i + 1) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading axis of size 1 before casting.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)
# Scaled Dot-Product Attention
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q @ k^T / sqrt(dk)) @ v with an optional additive mask.

    Args:
        q: Query tensor.
        k: Key tensor; the size of its last axis is used as dk.
        v: Value tensor.
        mask: Optional tensor broadcastable to the attention logits, or None.
            It is multiplied by -1e9 and added to the logits, so entries equal
            to 1 receive (near) zero attention weight.

    Returns:
        A tuple (output, attention_weights).
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        # Push masked positions towards -inf before the softmax.
        logits = logits + (mask * -1e9)

    # Normalize over the last axis.
    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, v), weights
# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    """Attention layer that projects q/k/v, attends per head, and re-merges.

    d_model must be divisible by num_heads; each head works on
    d_model // num_heads features.
    """

    def __init__(self, d_model, num_heads):
        """Create the q/k/v projections and the output projection."""
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape x to (batch_size, -1, num_heads, depth) and swap axes 1 and 2."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Run multi-head attention.

        Note the argument order: values, keys, queries, then the mask that is
        forwarded unchanged to scaled_dot_product_attention.

        Returns:
            A tuple (output, attention_weights).
        """
        batch_size = tf.shape(q)[0]

        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        attended, attention_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask
        )

        # Undo split_heads: swap axes back, then fold heads into d_model.
        merged = tf.reshape(
            tf.transpose(attended, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.d_model),
        )
        return self.dense(merged), attention_weights
# Transformer Encoder Layer
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: Feature size of the layer input and output.
            num_heads: Number of attention heads.
            dff: Hidden size of the feed-forward network.
            rate: Dropout rate used after each sub-layer.
        """
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to x; mask is forwarded to the attention layer."""
        # Self-attention: x serves as values, keys and queries.
        attended, _ = self.mha(x, x, x, mask)
        out1 = self.layernorm1(x + self.dropout1(attended, training=training))

        projected = self.dropout2(self.ffn(out1), training=training)
        return self.layernorm2(out1 + projected)
# Transformer Decoder Layer
class TransformerDecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention, encoder attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual
    connection and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: Feature size of the layer input and output.
            num_heads: Number of attention heads.
            dff: Hidden size of the feed-forward network.
            rate: Dropout rate used after each sub-layer.
        """
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Apply the block.

        Args:
            x: Decoder-side input.
            enc_output: Encoder output, used as keys and values of the second
                attention sub-layer.
            training: Forwarded to the dropout layers.
            look_ahead_mask: Mask for the decoder self-attention.
            padding_mask: Mask for the attention over enc_output.

        Returns:
            A tuple (output, self_attention_weights, encoder_attention_weights).
        """
        # Sub-layer 1: self-attention over the decoder input.
        self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(self.dropout1(self_attended, training=training) + x)

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_attended, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask
        )
        out2 = self.layernorm2(self.dropout2(cross_attended, training=training) + out1)

        # Sub-layer 3: position-wise feed-forward network.
        projected = self.dropout3(self.ffn(out2), training=training)
        out3 = self.layernorm3(projected + out2)
        return out3, attn_weights_block1, attn_weights_block2
# Full Transformer Model
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer that maps token ids to next-token logits.

    Token ids are embedded, scaled by sqrt(d_model) and summed with a
    sinusoidal positional encoding before entering the encoder and decoder
    stacks. Without this step the attention layers would receive raw integer
    ids, and the vocabulary-size and position arguments would have no effect.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        """Create embeddings, positional encodings, layer stacks and output head.

        Args:
            num_layers: Number of encoder layers and of decoder layers.
            d_model: Embedding / feature size.
            num_heads: Number of attention heads per layer.
            dff: Hidden size of each feed-forward network.
            input_vocab_size: Number of distinct input token ids.
            target_vocab_size: Number of distinct target token ids; also the
                size of the output logits.
            pe_input: Maximum input length covered by the positional encoding.
            pe_target: Maximum target length covered by the positional encoding.
            rate: Dropout rate.
        """
        super().__init__()
        self.d_model = d_model

        self.input_embedding = Embedding(input_vocab_size, d_model)
        self.target_embedding = Embedding(target_vocab_size, d_model)
        self.input_pos_encoding = positional_encoding(pe_input, d_model)
        self.target_pos_encoding = positional_encoding(pe_target, d_model)
        self.input_dropout = Dropout(rate)
        self.target_dropout = Dropout(rate)

        self.encoder = [TransformerEncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.decoder = [TransformerDecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def _embed(self, tokens, embedding, pos_encoding, dropout, training):
        """Embed token ids, add the positional encoding and apply dropout."""
        seq_len = tf.shape(tokens)[1]
        x = embedding(tokens)
        # Scale so the embeddings are not dwarfed by the positional signal.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # Only the first seq_len positions of the precomputed table are needed.
        x += pos_encoding[:, :seq_len, :]
        return dropout(x, training=training)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Run the encoder on inp and the decoder on tar.

        Args:
            inp: Input token ids — assumed (batch, input_len); confirm against
                callers.
            tar: Target token ids fed to the decoder — assumed
                (batch, target_len); confirm against callers.
            training: Forwarded to every dropout layer.
            enc_padding_mask: Mask for encoder self-attention, or None.
            look_ahead_mask: Mask for decoder self-attention, or None.
            dec_padding_mask: Mask for decoder attention over the encoder
                output, or None.

        Returns:
            Logits over the target vocabulary for every target position.
        """
        enc_output = self._embed(
            inp, self.input_embedding, self.input_pos_encoding, self.input_dropout, training
        )
        for encoder_layer in self.encoder:
            enc_output = encoder_layer(enc_output, training, enc_padding_mask)

        dec_output = self._embed(
            tar, self.target_embedding, self.target_pos_encoding, self.target_dropout, training
        )
        for decoder_layer in self.decoder:
            dec_output, _, _ = decoder_layer(dec_output, enc_output, training, look_ahead_mask, dec_padding_mask)

        return self.final_layer(dec_output)
# Hyperparameters
num_layers = 4
d_model = 512
num_heads = 8
dff = 2048
dropout_rate = 0.1

# Instantiate the Transformer
# Inputs and targets share one tokenizer; +1 accounts for padding id 0.
vocab_size = len(tokenizer.word_index) + 1
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    pe_input=max_len,
    pe_target=max_len,
    rate=dropout_rate,
)
Step 3: Train the Transformer Neural Network
Training a Transformer Neural Network involves feeding the input and target sequences into the model and optimizing the parameters using a loss function and an optimizer.
# Loss and Optimizer
# reduction='none' keeps one loss value per token so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Return the mean cross-entropy over non-padding target tokens.

    Args:
        real: Integer target token ids; id 0 is treated as padding.
        pred: Unnormalized logits for each target position.

    Returns:
        A scalar tensor: the summed loss of the non-padding positions divided
        by the number of non-padding positions (0 if there are none).
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # Divide by the number of real tokens, not by all positions; otherwise the
    # loss shrinks as padding grows. divide_no_nan yields 0 for an all-padding
    # batch instead of NaN.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
# Training Step
@tf.function
def train_step(inp, tar):
    """Run one teacher-forced optimization step on a batch.

    The decoder is fed tar without its last token and is trained to predict
    tar without its first token. Attention masks hide padding (id 0) and, in
    the decoder, all later target positions, so a position cannot read the
    token it is supposed to predict.

    Args:
        inp: Batch of input token ids — assumed 2-D (batch, input_len), as the
            column slicing of tar below requires for tar.
        tar: Batch of target token ids, 2-D (batch, target_len).

    Returns:
        The scalar loss of this batch.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    # 1.0 marks a position that must not be attended to. The two inserted
    # axes let the mask broadcast over heads and query positions.
    enc_padding_mask = tf.cast(tf.math.equal(inp, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
    tar_padding_mask = tf.cast(tf.math.equal(tar_inp, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]

    # Strictly upper-triangular ones: query t may not see keys after t.
    tar_len = tf.shape(tar_inp)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((tar_len, tar_len)), -1, 0)
    combined_mask = tf.maximum(tar_padding_mask, look_ahead_mask)

    with tf.GradientTape() as tape:
        # The decoder's attention over the encoder output reuses the input
        # padding mask because its keys are the encoder positions.
        predictions = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, enc_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    return loss
# Training Loop
EPOCHS = 10
BATCH_SIZE = 64

# train_step slices its arguments as 2-D (batch, length) tensors, so the
# padded arrays are served in shuffled mini-batches rather than row by row.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    .shuffle(buffer_size=max(1, len(input_sequences)))
    .batch(BATCH_SIZE)
)

for epoch in range(EPOCHS):
    # Sum of the per-batch losses for this epoch.
    total_loss = 0.0
    for inp, tar in train_dataset:
        total_loss += train_step(inp, tar)
    # float() handles both a tensor and the initial 0.0 (no batches).
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {float(total_loss):.4f}')
Step 4: Evaluate and Deploy the Model
After training, you can evaluate the chatbot using metrics like BLEU or ROUGE and then deploy it in a production environment using TensorFlow Serving or a web framework like Flask or Django. We are experts in developing and deploying with Django.
# Function to evaluate the model (for generating responses)
def evaluate(input_sentence):
    """Greedily decode a reply to input_sentence with the global transformer.

    Uses the module-level tokenizer, max_len and transformer. Decoding starts
    from the 'startseq' token and stops at 'endseq', at a predicted padding id
    (0), or after max_len - 1 generated tokens.

    Args:
        input_sentence: Raw user text.

    Returns:
        The generated reply as a space-joined string, without the start and
        end marker tokens.

    Raises:
        KeyError: If 'startseq' is not in the tokenizer vocabulary.
    """
    input_sentence = preprocess_text(input_sentence)
    input_sequence = tokenizer.texts_to_sequences([input_sentence])
    input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='post')

    start_id = tokenizer.word_index['startseq']
    end_id = tokenizer.word_index.get('endseq')

    # Integer buffer of token ids; slots not yet generated stay 0 (padding).
    output_sequence = np.zeros((1, max_len), dtype=np.int32)
    output_sequence[0, 0] = start_id

    # 1.0 marks a position that must not be attended to.
    enc_padding_mask = tf.cast(tf.math.equal(input_sequence, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
    # Causal mask: position t only sees positions <= t, so the still-empty
    # slots of the buffer cannot influence the prediction being read.
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((max_len, max_len)), -1, 0)

    response_ids = []
    for i in range(1, max_len):
        predictions = transformer(
            input_sequence, output_sequence, False,
            enc_padding_mask, look_ahead_mask, enc_padding_mask,
        )
        predicted_id = int(tf.argmax(predictions[0, i - 1]).numpy())
        # Id 0 is padding and has no entry in index_word; treat it as a stop.
        if predicted_id == end_id or predicted_id == 0:
            break
        output_sequence[0, i] = predicted_id
        response_ids.append(predicted_id)

    return ' '.join(tokenizer.index_word[idx] for idx in response_ids)
# Example usage: send one fixed question to evaluate() and print the reply
# it returns, prefixed with 'Bot: '.
input_sentence = "How are you?"
response = evaluate(input_sentence)
print(f'Bot: {response}')
Conclusion
Building a chatbot using TensorFlow and a Transformer architecture involves several key steps, including data preprocessing, model design, training, and evaluation. The Transformer architecture, with its self-attention mechanism and parallel processing capabilities, is highly effective for natural language processing tasks. By following this guide, you can create a sophisticated chatbot capable of understanding and generating human-like responses, paving the way for advanced conversational AI applications.