1. Classification vs Object Detection

Classification:

  • One image in
  • One class label out

Object detection:

  • One image in
  • Multiple bounding boxes and class labels out

import numpy as np
import cv2
import matplotlib.pyplot as plt
import argparse
import tensorflow as tf
import imutils
from imutils.object_detection import non_max_suppression
import time
print(tf.__version__, cv2.__version__)
2.3.0 4.4.0

class_cat = cv2.imread("images/class_cat.jpg")
class_cat = cv2.cvtColor(class_cat, cv2.COLOR_BGR2RGB)
obj_cat = cv2.imread("images/obj_cat.jpg")
obj_cat = cv2.cvtColor(obj_cat, cv2.COLOR_BGR2RGB)

fig = plt.figure(figsize = (10,10))
images = ("Classification", class_cat), ("Object Detection", obj_cat)
# loop over the images
for (i, (name, image)) in enumerate(images):
    # show the image
    ax = fig.add_subplot(1, 2, i + 1)
    ax.set_title(name)
    plt.imshow(image)
    plt.axis("off")

2. Object detection algorithm pattern

  1. Input: an image that we wish to apply object detection to

  2. Output: three values:

    2a. A list of bounding boxes, i.e. the (x, y)-coordinates of each object in the image

    2b. The class label associated with each of the bounding boxes

    2c. The probability/confidence score associated with each bounding box and class label

orig_cat = cv2.imread("images/keras-detection/dlui03.jpg")
orig_cat = cv2.cvtColor(orig_cat, cv2.COLOR_BGR2RGB)
class_cat = orig_cat.copy()
cv2.putText(class_cat, "{:.2f}".format(0.80), (100, 200), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.putText(class_cat, "cat", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
plt.imshow(class_cat)
plt.savefig("images/keras-detection/class_cat.jpg")

obj_cat = orig_cat.copy()
cv2.putText(obj_cat, "{:.2f}".format(0.70), (100, 200), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.putText(obj_cat, "cat", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.rectangle(obj_cat, (500, 10), (950, 600), (255, 0, 0), 6)

cv2.putText(obj_cat, "{:.2f}".format(0.70), (750, 1500), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.putText(obj_cat, "cat", (750, 1400), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.rectangle(obj_cat, (250, 1000), (600, 1400), (255, 0, 0), 6)
plt.imshow(obj_cat)
plt.savefig("images/keras-detection/obj_cat.jpg")

fig = plt.figure(figsize = (10,10))
images = ("Classification", class_cat), ("Object Detection", obj_cat)
# loop over the images
for (i, (name, image)) in enumerate(images):
    # show the image
    ax = fig.add_subplot(1, 2, i + 1)
    ax.set_title(name)
    plt.imshow(image)
    plt.axis("off")

3. Turn any classifier into an object detector

  • Before the CNN era, the state of the art for object detection was HOG (Histogram of Oriented Gradients) + SVM
  • This tutorial combines several techniques:

    1. Image pyramids: localize objects at different scales/sizes (a multi-scale representation of an image)
    2. Sliding windows: detect exactly where in the image a given object is
    3. Non-maxima suppression: collapse weak, overlapping bounding boxes (sketched right after this list)
import numpy as np
import cv2
import matplotlib.pyplot as plt
import argparse
import tensorflow as tf
import imutils
from imutils.object_detection import non_max_suppression
import time

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", default = "images/keras-detection/dlui03.jpg", #required=False,
    help="path to the input image")
ap.add_argument("-s", "--size", type=str, default="(200, 150)",
    help="ROI size (in pixels)")
ap.add_argument("-c", "--min-conf", type=float, default=0.7,
    help="minimum probability to filter weak detections")
ap.add_argument("-v", "--visualize", type=int, default=1,
    help="whether or not to show extra visualizations for debugging")
args = vars(ap.parse_args([]))

image = cv2.imread(args["image"])
#print(image.shape)
# resize image keeping aspect ratio
#r = 224 / image.shape[1] # ratio of new width /old width
#dim = (224, int(image.shape[0] * r)) # resized height
#image = cv2.resize(image, dim)
# move to RGB map
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# display image
fig = plt.figure(figsize = (5, 5))
ax = fig.add_subplot(111)
ax.set_title("cat: dlui")
plt.imshow(image)
plt.axis("off")
plt.colorbar()

4. Construct a sliding window generator

The sliding_window function is constructed as a generator.

Sliding window and ROIs for an image with three hierarchical pyramid levels.

def sliding_window(image, step, ws):
    # slide a window of size ws = (winW, winH) over the image
    for y in range(0, image.shape[0] - ws[1], step):  # row-wise loop
        # subtracting ws[1] keeps the window inside the image; y advances by step
        for x in range(0, image.shape[1] - ws[0], step):  # column-wise loop; x advances by step
            # use yield (instead of return) because this is a generator:
            # yield the current x- and y-positions and the current window
            yield (x, y, image[y:y + ws[1], x:x + ws[0]])

5. Construct a pyramid generator

The image_pyramid function is constructed as a generator.

At the bottom of the pyramid we have the original image at its original size (in terms of width and height). At each subsequent layer the image is resized (subsampled) by a scaling factor and optionally smoothed (usually via Gaussian blurring). The image is progressively subsampled (adding pyramid layers) until a stopping criterion is met, which is normally when a minimum size has been reached (smaller than the sliding-window size).

Image resizing takes place in two steps:

  • resize by scale, to construct the next layer in the pyramid (the image size is drastically reduced)

  • resize to keep the image aspect ratio (given the new width, the matching height is computed, so the proportions of the image in that pyramid layer are preserved)

def image_pyramid(image, scale=1.5, minSize=(224, 224)):
    # yield the original image, this is the base of the image pyramid
    yield image
    # keep looping over the image pyramid
    while True:
        # compute the dimensions of the next image in the pyramid
        #scale controls how much the image is resized at each layer
        w = int(image.shape[1] / scale)
        # resize the image and take care of image aspect-ratio
        image = imutils.resize(image, width=w) 
        # if the resized image does not meet the supplied minimum
        # size, then stop constructing the pyramid
        if image.shape[0] < minSize[1] or image.shape[1] < minSize[0]:
            break
        # yield the next image in the pyramid
        yield image
WIDTH = 600
PYR_SCALE = 1.5
WIN_STEP = 16*3  # large step so fewer windows are generated (running on a laptop)
ROI_SIZE = eval(args["size"])
INPUT_SIZE = (224, 224)  # input size of ResNet50, see model.summary()

6. Use ResNet trained with ImageNet for object detection

  1. Load the pretrained model (any model). Check which images it was trained on and whether its classification task transfers to your classification task. Include the top layer.

  2. Resize the images (size and aspect ratio) to fit the size of the InputLayer in the CNN.

print("[INFO] loading network...")
model = tf.keras.applications.resnet.ResNet50(weights="imagenet", include_top=True)
print("...Done")
[INFO] loading network...
...Done
orig = image
orig = imutils.resize(orig, width = WIDTH)
(H, W) = orig.shape[:2] # 800, 600

7. Classification of ROIs

For each level of the pyramid, run the sliding window. For each stop of the sliding window, extract the window part of that image (the ROI). Take the ROI and pass it through the pre-trained classifier. Look at the classification results: if for that ROI a classification score is greater than a minimum threshold, record the class label and the position of the ROI/window in the original image.

pyramid = image_pyramid(orig, scale=PYR_SCALE, minSize=ROI_SIZE)
# initialize two lists, one to hold the ROIs generated from the image pyramid 
#and sliding window, and another list used to store the
# (x, y)-coordinates of where the ROI was in the original image
rois = []
locs = []
# time how long it takes to loop over the image pyramid layers and
# sliding window locations
start = time.time()
counter = 0
tot_images = 0
for p, image in enumerate(pyramid):
    # determine the scale factor between the *original* image
    # dimensions and the *current* layer of the pyramid
    scale = W / float(image.shape[1])
    # for each layer of the image pyramid, loop over the sliding
    # window locations
    sw = 0
    for (x, y, roiOrig) in sliding_window(image, WIN_STEP, ROI_SIZE):
        sw = sw + 1
        # scale the (x, y)-coordinates of the ROI with respect to the
        # *original* image dimensions
        x = int(x * scale)
        y = int(y * scale)
        w = int(ROI_SIZE[0] * scale)
        h = int(ROI_SIZE[1] * scale)
        # take the ROI and pre-process it so we can later classify
        # the region using Keras/TensorFlow
        roi = cv2.resize(roiOrig, INPUT_SIZE)
        roi = tf.keras.preprocessing.image.img_to_array(roi)
        roi = tf.keras.applications.resnet.preprocess_input(roi)
        #print(roiOrig.shape, roi.shape)
        # update our list of ROIs and associated coordinates
        rois.append(roi)
        locs.append((x, y, x + w, y + h))
        # check to see if we are visualizing each of the sliding
        # windows in the image pyramid
        if args["visualize"] > 0:
            # clone the original image and then draw a bounding box
            # surrounding the current region
            clone = orig.copy()
            cv2.rectangle(clone, (x, y), (x + w, y + h),(0, 255, 0), 5)
            # show the visualization and current ROI
            #plt.imshow(clone)
            #var_name = "p" + str(p)+"_" + "sw" + str(sw) + ".jpg"
            #plt.savefig("images/clone_"+ var_name)
            #plt.imshow(roiOrig)
            #plt.savefig("images/roiOrig_"+ var_name)
            #cv2.waitKey(0)
            tot_images = tot_images + 1
print(roiOrig.shape, roi.shape)
# show how long it took to loop over the image pyramid layers and
# sliding window locations
end = time.time()
print("[INFO] looping over pyramid/windows took {:.5f} seconds".format(end - start))
print("Total images {:.2f}".format(tot_images))
(150, 200, 3) (224, 224, 3)
[INFO] looping over pyramid/windows took 0.21334 seconds
Total images 176.00
rois = np.array(rois, dtype="float32")
# classify each of the proposal ROIs using ResNet and then show how
# long the classifications took
print("[INFO] classifying ROIs...")
start = time.time()
my_preds = model.predict(rois)
end = time.time()
print("[INFO] classifying ROIs took {:.5f} seconds".format(end - start))
[INFO] classifying ROIs...
[INFO] classifying ROIs took 48.74360 seconds
preds = tf.keras.applications.imagenet_utils.decode_predictions(my_preds, top=1)
preds[30:35]
[[('n02085782', 'Japanese_spaniel', 0.33089334)],
 [('n02124075', 'Egyptian_cat', 0.6837845)],
 [('n02124075', 'Egyptian_cat', 0.75716674)],
 [('n02124075', 'Egyptian_cat', 0.43293664)],
 [('n02124075', 'Egyptian_cat', 0.6156667)]]
# map class labels (keys) to the ROIs associated with that label (values)
labels = {}
# loop over the predictions
for (i, p) in enumerate(preds):
    # grab the prediction information for the current ROI
    (imagenetID, label, prob) = p[0]

    # filter out weak detections by ensuring the predicted probability
    # is greater than the minimum probability
    if prob >= args["min_conf"]:
        # grab the bounding box associated with the prediction and
        # convert the coordinates
        box = locs[i]

        # grab the list of predictions for the label and add the
        # bounding box and probability to the list
        L = labels.get(label, [])
        L.append((box, prob))
        labels[label] = L
allclone = orig.copy()
for label in labels.keys():
    # clone the original image so that we can draw on it
    print("[INFO] showing results for '{}'".format(label))
    clone = orig.copy()

    # loop over all bounding boxes for the current label
    for (box, prob) in labels[label]:
        # draw the bounding box on the image
        (startX, startY, endX, endY) = box
        cv2.rectangle(clone, (startX, startY), (endX, endY),(0, 255, 0), 2)

    # show the results *before* applying non-maxima suppression, then
    # clone the image again so we can display the results *after*
    # applying non-maxima suppression
    #plt.imshow(clone)
    #cv2.imshow("Before", clone)
    clone = orig.copy()

    # extract the bounding boxes and associated prediction
    # probabilities, then apply non-maxima suppression
    boxes = np.array([p[0] for p in labels[label]])
    proba = np.array([p[1] for p in labels[label]])
    boxes = non_max_suppression(boxes, proba)
    # non_max_suppression returns only coordinates, so map each surviving
    # box back to its probability via the pre-NMS list
    box_to_prob = {tuple(b): p for (b, p) in labels[label]}

    # loop over all bounding boxes that were kept after applying
    # non-maxima suppression
    for (startX, startY, endX, endY) in boxes:
        prob = box_to_prob.get((startX, startY, endX, endY), proba.max())
        # draw the bounding box and label on the image
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)
        cv2.rectangle(allclone, (startX, startY), (endX, endY), (0, 255, 0), 2)
        y = startY - 10 if startY - 10 > 10 else startY + 10
        cv2.putText(clone, label, (startX, y + 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(clone, "{:.2f}".format(prob), (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(allclone, label, (startX, y + 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(allclone, "{:.2f}".format(prob), (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    # show the output after applying non-maxima suppression
    plt.imshow(clone)
    plt.imsave("images/keras-detection/_res03_" + label + ".jpg", clone)
plt.imshow(allclone)
plt.imsave("images/keras-detection/_allclone03.jpg", allclone)
[INFO] showing results for 'cowboy_boot'
[INFO] showing results for 'Egyptian_cat'
[INFO] showing results for 'guillotine'
[INFO] showing results for 'Arabian_camel'

8. The general flow of the algorithm

  1. Input an image
  2. Construct an image pyramid
  3. For each scale of the image pyramid, run a sliding window

    3a. For each stop of the sliding window, extract the ROIs

    3b. Take each ROI and pass it through our CNN, originally trained for image classification

    3c. Examine the probability of the CNN's top class label, and if it meets a minimum confidence, record (1) the class label and (2) the location of the sliding window

  4. Apply class-wise non-maxima suppression to the bounding boxes

  5. Return results to calling function