Working with TensorFlow and TensorFlow Datasets#
import tensorflow as tf
import tensorflow_datasets as tfds
List all the available datasets
tfds.list_builders()
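Since list_builders() returns an ordinary Python list of dataset names, it can be inspected and filtered like any list. A small sketch; the "mnist" substring filter is just an illustration:
builders = tfds.list_builders()
print(len(builders))  # total number of registered datasets
print([name for name in builders if "mnist" in name])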
Dataset Information#
We will first use tfds.builder to obtain information about a dataset such as MNIST. Take a look at the available information for this dataset, especially the available features (features) and the total number of examples (total_num_examples).
builder = tfds.builder("mnist")
print(builder.info)
Features#
builder = tfds.builder("mnist")
print(builder.info.features)
Label details#
builder = tfds.builder("mnist")
# Number of classes
print(builder.info.features["label"].num_classes)
# Class names
print(builder.info.features["label"].names)
# Get the number equivalent to a label
print(builder.info.features["label"].str2int("8"))
# Shape of each feature
print(builder.info.features.shape)
# Dtype of the label
print(builder.info.features["label"].dtype)
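The reverse mapping is available as well: int2str returns the class name for a numeric label.
builder = tfds.builder("mnist")
# Get the label corresponding to a class index (the reverse of str2int)
print(builder.info.features["label"].int2str(8))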
Features of different datasets#
Remove the break from the following code to see the available features of the different datasets.
for dataset in tfds.list_builders():
    builder = tfds.builder(dataset)
    print(f"Name: {builder.info.name}\nDescription: {builder.info.description}")
    print(f"Features: {builder.info.features}")
    break
Loading a dataset#
Let’s start by loading the MNIST dataset for handwritten digit recognition.
ds = tfds.load("mnist", split="train", shuffle_files=True, try_gcs=True)
assert isinstance(ds, tf.data.Dataset)
print(ds)
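A quick way to inspect a loaded dataset without iterating over it: every tf.data.Dataset exposes element_spec, and cardinality() reports the number of examples when it is known (it may return an unknown-cardinality sentinel for some pipelines).
# Structure (dtype and shape) of each element
print(ds.element_spec)
# Number of examples, when known
print(ds.cardinality().numpy())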
Iterate over a dataset. Each entry in the dataset has 2 parts: image of a handwritten digit and the associated label.
for example in ds:  # example is `{'image': tf.Tensor, 'label': tf.Tensor}`
    print(list(example.keys()))
    image = example["image"]
    label = example["label"]
    print(image.shape, label)
    break
Passing as_supervised=True instead returns each example as an (image, label) tuple.
ds = tfds.load("mnist", split="train", as_supervised=True, try_gcs=True)
for image, label in ds:  # example is (image, label)
    print(label)
    break
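If you prefer plain NumPy arrays over tensors, tfds.as_numpy converts the dataset. A minimal sketch:
for image, label in tfds.as_numpy(ds):
    # image and label are now NumPy values instead of tf.Tensors
    print(type(image), image.shape, label)
    break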
Visualization#
Use take() and pass a number n to select n examples from the dataset, then visualize them with tfds.as_dataframe. Passing with_info=True makes tfds.load also return the dataset info that tfds.as_dataframe needs. Try changing with_info to False and see the errors.
ds, info = tfds.load("mnist", split="train", with_info=True, try_gcs=True)
tfds.as_dataframe(ds.take(1), info)
Change the parameter value of ds.take().
ds, info = tfds.load("mnist", split="train", with_info=True, try_gcs=True)
tfds.as_dataframe(ds.take(10), info)
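For image datasets, tfds.show_examples renders a grid of examples together with their labels, using the dataset and its info object (it requires matplotlib):
fig = tfds.show_examples(ds, info)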
Splitting datasets for training and testing#
For tasks like classification, it is important to split the data into training and test sets. There are several ways this can be done. In the following example, we display the information of the dataset after loading it. Take a look at the different information like features, splits, and total_num_examples.
(ds_train, ds_test), info = tfds.load("mnist", split=["train", "test"], with_info=True)
print(info)
To create a training dataset from the first 80% of the training split.
ds_train, info = tfds.load("mnist", split="train[:80%]", with_info=True)
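The same slicing syntax can carve a validation set out of the remaining 20% of the training split. A minimal sketch:
ds_train = tfds.load("mnist", split="train[:80%]")
ds_val = tfds.load("mnist", split="train[80%:]")
print(ds_train.cardinality().numpy(), ds_val.cardinality().numpy())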
Applying modifications#
Use map() to apply a transformation to every element of a dataset. The function below rescales the uint8 pixel values to float32 values in [0, 1].
def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label
(ds_train, ds_test), info = tfds.load(
    "mnist", split=["train", "test"], as_supervised=True, with_info=True
)
ds_train = ds_train.map(normalize_img)
ds_test = ds_test.map(normalize_img)
Batches#
For training and testing, it is important to create batches. Use batch() to create batches of the specified size. For example, the code below creates batches of 128 examples.
(ds_train, ds_test), info = tfds.load(
    "mnist", split=["train", "test"], as_supervised=True, with_info=True
)
ds_train = ds_train.batch(128)
ds_test = ds_test.batch(128)
print(ds_train)
print(ds_test)
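In practice, the loading, mapping, and batching steps are usually chained together with caching, shuffling, and prefetching. The ordering below follows TensorFlow's general input-pipeline guidance and reuses normalize_img from above; it is one reasonable arrangement, not the only one.
(ds_train, ds_test), info = tfds.load(
    "mnist", split=["train", "test"], as_supervised=True, with_info=True
)
ds_train = (
    ds_train.map(normalize_img)
    .cache()  # cache decoded examples after the first epoch
    .shuffle(info.splits["train"].num_examples)  # shuffle the full training set
    .batch(128)
    .prefetch(tf.data.AUTOTUNE)  # overlap preprocessing with training
)
ds_test = ds_test.map(normalize_img).batch(128).cache().prefetch(tf.data.AUTOTUNE)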
Building a training model#
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=info.features["image"].shape),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=["accuracy"],
)
Model Summary#
# summary() prints the layer table itself, so no extra print() is needed
model.summary()
Visualizing the model#
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True)
Training#
# batch_size must not be passed when the input is already a batched tf.data.Dataset
history = model.fit(ds_train, epochs=10, validation_data=ds_test)
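If you want training to stop automatically once the validation loss stops improving, Keras provides the EarlyStopping callback. A minimal sketch; the patience of 3 epochs is an arbitrary choice:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)
# A separate training run that may stop early; kept in its own variable so the
# history used below still refers to the full 10-epoch run
history_es = model.fit(ds_train, epochs=10, validation_data=ds_test, callbacks=[early_stop])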
History of training#
# List the different data series recorded in history
for histinfo in history.history.keys():
    print(f"{histinfo}: {history.history[histinfo]}")
Visualizing the history
import matplotlib.pyplot as plt
plt.plot(history.history["accuracy"], label="Training")
plt.plot(history.history["val_accuracy"], label="Validation")
plt.legend(loc="upper left")
plt.ylabel("Accuracy")
plt.xlabel("Number of Epochs")
plt.title("History of Training and Validation Accuracy across Epochs")
plt.show()
plt.plot(history.history["loss"], label="Training")
plt.plot(history.history["val_loss"], label="Validation")
plt.legend(loc="upper left")
plt.ylabel("Loss")
plt.xlabel("Number of Epochs")
plt.title("History of Training and Validation Loss across Epochs")
plt.show()
Finally, evaluate the model on the test set.
loss, accuracy = model.evaluate(ds_test, verbose=0)
print(f"accuracy: {accuracy} and loss: {loss}")
Prediction#
# Creating a dataset for testing; keep shuffle_files=False so the iteration
# order is stable when we compare predictions against labels later
ds_test = tfds.load("mnist", split="test[20%:]", as_supervised=True, shuffle_files=False)
# The model's final Dense layer already applies softmax, so its outputs are
# class probabilities and no extra Softmax layer is needed
probability_model = model
# Creating batches
ds_test_batch = ds_test.batch(128)
# Prediction
predictions = probability_model.predict(ds_test_batch)
Obtaining the number of predictions made
print(len(predictions))
Check the probability values for the second prediction
print(predictions[1])
Get the class with the highest probability
import numpy as np
print(np.argmax(predictions[1]))
Get the class with the highest probability for all the predictions
predictedlabels = [np.argmax(predictions[i]) for i in range(len(predictions))]
Get the actual class or label from the test dataset.
data = ds_test.as_numpy_iterator()
testdata = list(data)
labels = [testdata[i][1] for i in range(len(testdata))]
print(labels[1])
Evaluate the predictions using a confusion matrix
confusionmatrix = tf.math.confusion_matrix(labels, predictedlabels, num_classes=10)
print(confusionmatrix)
Visualizing the confusion matrix
import seaborn as sn
sn.heatmap(confusionmatrix)
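The heatmap is easier to read with per-cell counts and axis labels. A sketch using seaborn's annot and fmt options:
import matplotlib.pyplot as plt
sn.heatmap(confusionmatrix, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.show()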