Working with TensorFlow and TensorFlow Datasets#

import tensorflow as tf

import tensorflow_datasets as tfds

List all the available datasets

tfds.list_builders()
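
As a quick check (the exact list depends on your installed tensorflow_datasets version), you can count the registered builders and peek at the first few:

builders = tfds.list_builders()

# The registry is large and grows between releases
print(len(builders))
print(builders[:5])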

Dataset Information#

We will first use tfds.builder to obtain information related to a dataset like MNIST. Take a look at the available information for this dataset, especially the available features (features) and the total number of examples (total_num_examples).

builder = tfds.builder("mnist")
print(builder.info)

Features#

builder = tfds.builder("mnist")
print(builder.info.features)

Label details#

builder = tfds.builder("mnist")

# Number of classes
print(builder.info.features["label"].num_classes)

# Class names
print(builder.info.features["label"].names)

# Get the number equivalent to a label
print(builder.info.features["label"].str2int("8"))

# Shapes of all features
print(builder.info.features.shape)

# type of label
print(builder.info.features["label"].dtype)
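
The inverse mapping is also available: int2str converts a numeric label back to its string name. A minimal sketch:

# Map a numeric label back to its string name (inverse of str2int)
print(builder.info.features["label"].int2str(8))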

Features of different datasets#

Remove the break statement from the following code to see the available features of the different datasets.

for dataset in tfds.list_builders():
    builder = tfds.builder(dataset)
    print(f"Name: {builder.info.name}\nDescription: {builder.info.description}")
    print(f"Features: {builder.info.features}")
    break

Loading a dataset#

Let’s start by loading the MNIST dataset for handwritten digit recognition.

ds = tfds.load("mnist", split="train", shuffle_files=True, try_gcs=True)
assert isinstance(ds, tf.data.Dataset)
print(ds)

Iterate over the dataset. Each entry has two parts: an image of a handwritten digit and the associated label.

for example in ds:  # example is `{'image': tf.Tensor, 'label': tf.Tensor}`
    print(list(example.keys()))
    image = example["image"]
    label = example["label"]
    print(image.shape, label)
    break
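
If you prefer working with plain NumPy arrays instead of tf.Tensor objects, tfds.as_numpy converts the dataset. A small sketch using the same ds:

# Iterate with NumPy arrays instead of tf.Tensors
for example in tfds.as_numpy(ds.take(1)):
    print(type(example["image"]), example["image"].shape, example["label"])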

Obtain a tuple. Passing as_supervised=True returns each entry as an (image, label) tuple instead of a dictionary.

ds = tfds.load("mnist", split="train", as_supervised=True, try_gcs=True)

for image, label in ds:  # example is (image, label)
    print(label)
    break

Visualization#

Another way is to use take() and pass a number n to select the first n examples from the dataset. Passing with_info=True returns the dataset metadata that tfds.as_dataframe needs to build the dataframe for visualization. Try changing with_info to False and observe the errors.

ds, info = tfds.load("mnist", split="train", with_info=True, try_gcs=True)

tfds.as_dataframe(ds.take(1), info)

Change the parameter value of ds.take().

ds, info = tfds.load("mnist", split="train", with_info=True, try_gcs=True)

tfds.as_dataframe(ds.take(10), info)
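
For image datasets, tfds also provides a convenience helper, tfds.show_examples, which renders a grid of samples with their labels (it requires matplotlib to be installed):

# Render a grid of sample images with their labels
fig = tfds.show_examples(ds.take(9), info)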

Splitting datasets for training and testing#

For tasks like classification, it is important to split the data into training and test sets. There are several ways this can be done. In the following example, we display the information of the dataset after loading it. Take a look at the different fields such as features, splits, and total_num_examples.

(ds_train, ds_test), info = tfds.load("mnist", split=["train", "test"], with_info=True)
print(info)
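
The splits metadata can also be queried directly, for example to get the number of examples per split:

# Number of examples in each split
print(info.splits["train"].num_examples)
print(info.splits["test"].num_examples)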

To create a training dataset from the first 80% of the training split:

ds_train, info = tfds.load("mnist", split="train[:80%]", with_info=True)
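
The TFDS split API supports several other slicing forms; a few sketches:

# Absolute slicing: the first 10,000 examples of the training split
ds_small = tfds.load("mnist", split="train[:10000]")

# Percent slicing: examples between the 50% and 75% marks
ds_mid = tfds.load("mnist", split="train[50%:75%]")

# Concatenation of splits
ds_all = tfds.load("mnist", split="train+test")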

Applying modifications#

We can transform every example of a dataset with map(). The normalize_img function below rescales pixel values from the uint8 range [0, 255] to float32 values in [0, 1].

def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


(ds_train, ds_test), info = tfds.load(
    "mnist", split=["train", "test"], as_supervised=True, with_info=True
)
ds_train = ds_train.map(normalize_img)
ds_test = ds_test.map(normalize_img)
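
A common follow-up, per the standard tf.data performance guidance, is to cache, shuffle, batch, and prefetch the training pipeline. A sketch (tf.data.AUTOTUNE assumes TF 2.4+; older versions use tf.data.experimental.AUTOTUNE):

# Cache after the cheap map, shuffle for randomness, batch, and
# prefetch so the input pipeline overlaps with training
ds_train = ds_train.cache().shuffle(10_000).batch(128).prefetch(tf.data.AUTOTUNE)
ds_test = ds_test.batch(128).prefetch(tf.data.AUTOTUNE)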

Batches#

For training and testing, it is important to create batches. Use batch() to group examples into batches of a specified size. For example, the code below creates batches of 128 examples.

(ds_train, ds_test), info = tfds.load(
    "mnist", split=["train", "test"], as_supervised=True, with_info=True
)
# Reuse normalize_img from the previous section so the model sees
# normalized inputs, then group the examples into batches of 128
ds_train = ds_train.map(normalize_img).batch(128)
ds_test = ds_test.map(normalize_img).batch(128)
print(ds_train)
print(ds_test)

Building a training model#

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=info.features["image"].shape),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=["accuracy"],
)
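
An equivalent, often numerically more stable formulation is to drop the final softmax and let the loss work on raw logits; a sketch under that design:

# Alternative: the last layer emits logits; from_logits=True tells the
# loss to apply softmax internally
model_logits = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=info.features["image"].shape),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10),
    ]
)
model_logits.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=["accuracy"],
)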

Model Summary#

# model.summary() prints the summary itself; no print() needed
model.summary()

Visualizing the model#

# plot_model requires the pydot and graphviz packages to be installed
from tensorflow.keras.utils import plot_model

plot_model(model, show_shapes=True)

Training#

# batch_size must not be passed when the input is a tf.data.Dataset;
# the dataset is already batched
history = model.fit(ds_train, epochs=10, validation_data=ds_test)

History of training#

# List the different metrics recorded in history
for histinfo in history.history.keys():
    print(f"{histinfo}: {history.history[histinfo]}")

Visualizing the history

import matplotlib.pyplot as plot

plot.plot(history.history["accuracy"], label="Training")
plot.plot(history.history["val_accuracy"], label="Validation")
plot.legend(loc="upper left")
plot.ylabel("Accuracy")
plot.xlabel("Number of Epochs")
plot.title("History of Training and Validation Accuracy across epochs")
# show() closes the figure so the loss curves get their own axes
plot.show()

plot.plot(history.history["loss"], label="Training")
plot.plot(history.history["val_loss"], label="Validation")
plot.legend(loc="upper left")
plot.ylabel("Loss")
plot.xlabel("Number of Epochs")
plot.title("History of Training and Validation Loss across epochs")
plot.show()

loss, accuracy = model.evaluate(ds_test, verbose=0)
print(f"accuracy: {accuracy} and loss: {loss}")

Prediction#

# Creating a dataset for testing; shuffle_files=False keeps the iteration
# order deterministic, so the predictions below stay aligned with the labels
ds_test = tfds.load("mnist", split="test[20%:]", as_supervised=True, shuffle_files=False)

# The model's final Dense layer already applies softmax, so it outputs
# class probabilities directly and no extra Softmax layer is needed
probability_model = model

# Creating batches, normalized the same way as during training
ds_test_batch = ds_test.map(normalize_img).batch(128)

# Prediction
predictions = probability_model.predict(ds_test_batch)

Obtaining the number of predictions made

print(len(predictions))

Check the probability values for the second prediction

print(predictions[1])

Get the class with the highest probability

import numpy as np

print(np.argmax(predictions[1]))

Get the class with the highest probability for every prediction

predictedlabels = [np.argmax(predictions[i]) for i in range(len(predictions))]
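
Equivalently, the argmax can be taken over the class axis of the whole array at once:

# Vectorized form: one argmax over axis 1 instead of a Python loop
predictedlabels = np.argmax(predictions, axis=1)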

Get the actual classes or labels from the test dataset.

data = ds_test.as_numpy_iterator()
testdata = list(data)
labels = [testdata[i][1] for i in range(len(testdata))]
print(labels[1])

Evaluate the predictions using a confusion matrix

confusionmatrix = tf.math.confusion_matrix(labels, predictedlabels, num_classes=10)
print(confusionmatrix)
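
As a sanity check, the overall accuracy can be recovered from the diagonal of the confusion matrix:

# Correct predictions sit on the diagonal
correct = tf.reduce_sum(tf.linalg.diag_part(confusionmatrix))
total = tf.reduce_sum(confusionmatrix)
print(float(correct) / float(total))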

Visualizing the confusion matrix

import seaborn as sn

sn.heatmap(confusionmatrix)
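
The heatmap is easier to read with per-cell counts and axis labels; a sketch:

import matplotlib.pyplot as plot

# annot=True writes the count into each cell; fmt="d" formats it as an integer
sn.heatmap(confusionmatrix, annot=True, fmt="d", cmap="Blues")
plot.xlabel("Predicted label")
plot.ylabel("True label")
plot.show()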