Turn a CNN into an object detector
This follows a tutorial on turning any CNN image classifier into an object detector with Keras, TensorFlow, and OpenCV.
import numpy as np
import cv2
import matplotlib.pyplot as plt
import argparse
import tensorflow as tf
import imutils
from imutils.object_detection import non_max_suppression
import time
print(tf.__version__, cv2.__version__)
# these two figures are generated further down in this notebook
class_cat = cv2.imread("images/keras-detection/class_cat.jpg")
class_cat = cv2.cvtColor(class_cat, cv2.COLOR_BGR2RGB)
obj_cat = cv2.imread("images/keras-detection/obj_cat.jpg")
obj_cat = cv2.cvtColor(obj_cat, cv2.COLOR_BGR2RGB)
fig = plt.figure(figsize=(10, 10))
images = [("Classification", class_cat), ("Object Detection", obj_cat)]
# loop over the images and show each in its own subplot
for (i, (name, image)) in enumerate(images):
    ax = fig.add_subplot(1, 2, i + 1)
    ax.set_title(name)
    plt.imshow(image)
    plt.axis("off")
2. Object detection algorithm pattern
- Input: an image to which we wish to apply object detection
- Output: three values:
  2a. A list of bounding boxes, i.e. the (x, y)-coordinates of each object in the image
  2b. The class label associated with each bounding box
  2c. The probability/confidence score associated with each bounding box and class label
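As a minimal sketch (the variable names are hypothetical), this output can be represented as a list of (box, label, score) tuples; the two boxes mirror those drawn on obj_cat below:
detections = [
    # ((startX, startY, endX, endY), class label, confidence)
    ((500, 10, 950, 600), "cat", 0.70),
    ((250, 1000, 600, 1400), "cat", 0.70),
]
for (box, label, score) in detections:
    print("{}: {:.2f} at {}".format(label, score, box))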
orig_cat = cv2.imread("images/keras-detection/dlui03.jpg")
orig_cat = cv2.cvtColor(orig_cat, cv2.COLOR_BGR2RGB)
# simulate a classification result: one label + confidence for the whole image
class_cat = orig_cat.copy()
cv2.putText(class_cat, "{:.2f}".format(0.80), (100, 200), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.putText(class_cat, "cat", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
plt.imshow(class_cat)
plt.savefig("images/keras-detection/class_cat.jpg")
# simulate a detection result: a label + confidence + bounding box per object
obj_cat = orig_cat.copy()
cv2.putText(obj_cat, "{:.2f}".format(0.70), (100, 200), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.putText(obj_cat, "cat", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.rectangle(obj_cat, (500, 10), (950, 600), (255, 0, 0), 6)
cv2.putText(obj_cat, "{:.2f}".format(0.70), (750, 1500), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.putText(obj_cat, "cat", (750, 1400), cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 6)
cv2.rectangle(obj_cat, (250, 1000), (600, 1400), (255, 0, 0), 6)
plt.imshow(obj_cat)
plt.savefig("images/keras-detection/obj_cat.jpg")
fig = plt.figure(figsize=(10, 10))
images = [("Classification", class_cat), ("Object Detection", obj_cat)]
# loop over the annotated images and show them side by side
for (i, (name, image)) in enumerate(images):
    ax = fig.add_subplot(1, 2, i + 1)
    ax.set_title(name)
    plt.imshow(image)
    plt.axis("off")
3. Turn any classifier into an object detector
- Before the deep learning era, the state of the art for object detection was HOG (Histogram of Oriented Gradients) + SVM.
This tutorial combines several approaches (the first two are implemented in the sections below; the third is sketched right after this list):
- Image pyramids: localize objects at different scales/sizes (a multi-scale representation of an image)
- Sliding windows: detect exactly where in the image a given object is
- Non-maxima suppression: collapse weak, overlapping bounding boxes
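Non-maxima suppression itself is imported from imutils below. As a hedged sketch of the idea behind that import (a plain greedy IoU-based NMS, not imutils's actual implementation), written to return the *indices* of the kept boxes so their scores stay aligned:
import numpy as np

def nms_sketch(boxes, scores, iou_thresh=0.3):
    # boxes: (N, 4) array of (startX, startY, endX, endY); scores: (N,)
    boxes = boxes.astype("float")
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # process highest-scoring boxes first
    keep = []
    while len(order) > 0:
        i = order[0]
        keep.append(i)
        # overlap of the kept box with every remaining box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0, xx2 - xx1 + 1) * np.maximum(0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # drop the remaining boxes that overlap the kept box too much
        order = order[1:][iou <= iou_thresh]
    return keep  # indices of the kept boxes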
import numpy as np
import cv2
import matplotlib.pyplot as plt
import argparse
import tensorflow as tf
import imutils
from imutils.object_detection import non_max_suppression
import time
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", default="images/keras-detection/dlui03.jpg",
    help="path to the input image")
ap.add_argument("-s", "--size", type=str, default="(200, 150)",
    help="ROI size (in pixels)")
ap.add_argument("-c", "--min-conf", type=float, default=0.7,
    help="minimum probability to filter weak detections")
ap.add_argument("-v", "--visualize", type=int, default=1,
    help="whether or not to show extra visualizations for debugging")
# parse an empty argument list so the defaults are used inside the notebook
args = vars(ap.parse_args([]))
image = cv2.imread(args["image"])
#print(image.shape)
# resize image keeping aspect ratio
#r = 224 / image.shape[1] # ratio of new width /old width
#dim = (224, int(image.shape[0] * r)) # resized height
#image = cv2.resize(image, dim)
# move to RGB map
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# display image
fig = plt.figure(figsize = (5, 5))
ax = fig.add_subplot(111)
ax.set_title("cat: dlui")
plt.imshow(image)
plt.axis("off")
plt.colorbar()

def sliding_window(image, step, ws):
    # slide a window of size ws across the image
    for y in range(0, image.shape[0] - ws[1], step):  # row-wise loop
        # subtracting ws[1]/ws[0] keeps the window inside the image;
        # the position advances by `step` pixels each iteration
        for x in range(0, image.shape[1] - ws[0], step):  # column-wise loop
            # yield (instead of return) because this is a generator:
            # the current (x, y) position and the window it defines
            yield (x, y, image[y:y + ws[1], x:x + ws[0]])
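A quick sanity check of the generator on a blank (hypothetical) 600x800 image: with a (200, 150) window and a step of 48 pixels it yields a grid of positions, each carrying a 150x200 crop.
toy = np.zeros((800, 600, 3), dtype="uint8")  # rows x cols x channels
windows = list(sliding_window(toy, step=48, ws=(200, 150)))
print(len(windows))         # number of (x, y, window) tuples generated
print(windows[0][2].shape)  # each window is (150, 200, 3)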
4. Construct a pyramid generator
The image_pyramid function is constructed as a generator.
At the bottom of the pyramid we have the original image at its original size (in terms of width and height). At each subsequent layer the image is resized (subsampled) by a scaling factor and optionally smoothed (usually via Gaussian blurring). The image is progressively subsampled (adding pyramid layers) until a stopping criterion is met, normally when a minimum size has been reached (i.e. the image is smaller than the sliding window).
Image resizing takes place in two steps:
- resize by scale, to construct the next layer in the pyramid (the image size is reduced substantially)
- resize to preserve the aspect ratio of the image at that pyramid layer (imutils.resize computes the matching height for the new width)
def image_pyramid(image, scale=1.5, minSize=(224, 224)):
    # yield the original image: the base of the image pyramid
    yield image
    # keep looping over the image pyramid
    while True:
        # compute the width of the next image in the pyramid;
        # `scale` controls how much the image shrinks at each layer
        w = int(image.shape[1] / scale)
        # resize the image, preserving the aspect ratio
        image = imutils.resize(image, width=w)
        # if the resized image does not meet the supplied minimum
        # size, stop constructing the pyramid
        if image.shape[0] < minSize[1] or image.shape[1] < minSize[0]:
            break
        # yield the next image in the pyramid
        yield image
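A quick check of what the generator produces, again on a hypothetical blank image: each layer is roughly 1.5x smaller than the previous one, and the loop stops before a layer drops below minSize.
toy = np.zeros((800, 600, 3), dtype="uint8")
for layer in image_pyramid(toy, scale=1.5, minSize=(200, 150)):
    print(layer.shape)  # e.g. (800, 600, 3), (533, 400, 3), (354, 266, 3)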
WIDTH = 600  # width the input image is resized to
PYR_SCALE = 1.5
WIN_STEP = 16 * 3  # a large step keeps the ROI count small enough for a laptop
ROI_SIZE = eval(args["size"])  # parse the "(200, 150)" string into a tuple
INPUT_SIZE = (224, 224)  # input size of ResNet50 (see model.summary())
5. Use ResNet trained on ImageNet for object detection
- Load a pretrained model (any classification model works). Check what images it was trained on and whether its classification task transfers to yours. Include the top (classification) layer.
- Resize the ROIs (size and aspect ratio) to fit the InputLayer of the CNN.
print("[INFO] loading network...")
model = tf.keras.applications.resnet.ResNet50(weights = "imagenet", include_top = "True")
print("...Done")
orig = image
orig = imutils.resize(orig, width=WIDTH)
(H, W) = orig.shape[:2]  # e.g. (800, 600) for this input
6. Classification of ROIs
For each level in the pyramid, run the sliding window. At each stop of the sliding window, extract that window of the image (the ROI). Pass the ROI through the pre-trained classifier. If the top classification score for that ROI is greater than a minimum threshold, record the class label and the position of the ROI/window in the original image.
pyramid = image_pyramid(orig, scale=PYR_SCALE, minSize=ROI_SIZE)
# initialize two lists: one to hold the ROIs generated from the image
# pyramid and sliding window, and another to store the (x, y)-coordinates
# of where each ROI was in the original image
rois = []
locs = []
# time how long it takes to loop over the image pyramid layers and
# sliding window locations
start = time.time()
tot_images = 0
for p, image in enumerate(pyramid):
    # determine the scale factor between the *original* image
    # dimensions and the *current* layer of the pyramid
    scale = W / float(image.shape[1])
    # for each layer of the image pyramid, loop over the sliding
    # window locations
    sw = 0
    for (x, y, roiOrig) in sliding_window(image, WIN_STEP, ROI_SIZE):
        sw += 1
        # scale the (x, y)-coordinates of the ROI with respect to the
        # *original* image dimensions
        x = int(x * scale)
        y = int(y * scale)
        w = int(ROI_SIZE[0] * scale)
        h = int(ROI_SIZE[1] * scale)
        # take the ROI and pre-process it so we can later classify
        # the region using Keras/TensorFlow
        roi = cv2.resize(roiOrig, INPUT_SIZE)
        roi = tf.keras.preprocessing.image.img_to_array(roi)
        roi = tf.keras.applications.resnet.preprocess_input(roi)
        # update the lists of ROIs and associated coordinates
        rois.append(roi)
        locs.append((x, y, x + w, y + h))
        # check to see if we are visualizing each of the sliding
        # windows in the image pyramid
        if args["visualize"] > 0:
            # clone the original image and draw a bounding box
            # around the current region
            clone = orig.copy()
            cv2.rectangle(clone, (x, y), (x + w, y + h), (0, 255, 0), 5)
            # optionally save the visualization and the current ROI:
            # plt.imsave("images/clone_p{}_sw{}.jpg".format(p, sw), clone)
            # plt.imsave("images/roiOrig_p{}_sw{}.jpg".format(p, sw), roiOrig)
        tot_images += 1
print(roiOrig.shape, roi.shape)
# show how long it took to loop over the image pyramid layers and
# sliding window locations
end = time.time()
print("[INFO] looping over pyramid/windows took {:.5f} seconds".format(end - start))
print("Total ROIs: {}".format(tot_images))
rois = np.array(rois, dtype="float32")
# classify each of the proposal ROIs using ResNet and then show how
# long the classifications took
print("[INFO] classifying ROIs...")
start = time.time()
my_preds = model.predict(rois)
end = time.time()
print("[INFO] classifying ROIs took {:.5f} seconds".format(end - start))
preds = tf.keras.applications.imagenet_utils.decode_predictions(my_preds, top=1)
preds[30:35]
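With top=1, decode_predictions returns one single-element list per ROI, each element an (imagenetID, label, probability) tuple; for example, the first ROI can be unpacked as:
(imagenetID, label, prob) = preds[0][0]
print(label, prob)  # the top ImageNet class and its probability for this ROI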
# map labels (keys) to the ROIs associated with that label (values)
labels = {}
# loop over the predictions
for (i, p) in enumerate(preds):
    # grab the prediction information for the current ROI
    (imagenetID, label, prob) = p[0]
    # filter out weak detections by ensuring the predicted probability
    # is greater than the minimum probability
    if prob >= args["min_conf"]:
        # grab the bounding box associated with the prediction
        box = locs[i]
        # grab the list of detections for this label and add the
        # bounding box and probability to it
        L = labels.get(label, [])
        L.append((box, prob))
        labels[label] = L
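A quick summary of the grouping: how many boxes survived the confidence filter for each label.
for label, detections in labels.items():
    print("{}: {} boxes, max confidence {:.2f}".format(
        label, len(detections), max(prob for (_, prob) in detections)))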
allclone = orig.copy()
for label in labels.keys():
    print("[INFO] showing results for '{}'".format(label))
    # clone the original image so that we can draw on it
    clone = orig.copy()
    # loop over all bounding boxes for the current label and draw
    # the results *before* applying non-maxima suppression
    for (box, prob) in labels[label]:
        (startX, startY, endX, endY) = box
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)
    # clone the image again so we can display the results *after*
    # applying non-maxima suppression
    clone = orig.copy()
    # extract the bounding boxes and associated prediction
    # probabilities, then apply non-maxima suppression
    boxes = np.array([p[0] for p in labels[label]])
    proba = np.array([p[1] for p in labels[label]])
    boxes = non_max_suppression(boxes, proba)
    # loop over all bounding boxes that were kept after applying
    # non-maxima suppression
    for (startX, startY, endX, endY) in boxes:
        # draw the bounding box and label on the image; note that
        # non_max_suppression returns only boxes, so `prob` here is
        # simply the last probability left over from the loop above
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)
        cv2.rectangle(allclone, (startX, startY), (endX, endY), (0, 255, 0), 2)
        y = startY - 10 if startY - 10 > 10 else startY + 10
        cv2.putText(clone, label, (startX, y + 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(clone, "{:.2f}".format(prob), (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(allclone, label, (startX, y + 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(allclone, "{:.2f}".format(prob), (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    # show and save the per-label output after non-maxima suppression
    plt.imshow(clone)
    plt.imsave("images/keras-detection/_res03_" + label + ".jpg", clone)
# show and save all labels drawn on a single image
plt.imshow(allclone)
plt.imsave("images/keras-detection/_allclone03.jpg", allclone)
7. The general flow of the algorithm
- Input an image
- Construct an image pyramid
- For each scale of the image pyramid, run a sliding window:
  3a. For each stop of the sliding window, extract the ROI
  3b. Pass the ROI through a CNN originally trained for image classification
  3c. Examine the probability of the CNN's top class label; if it meets a minimum confidence, record (1) the class label and (2) the location of the sliding window
- Apply class-wise non-maxima suppression to the bounding boxes
- Return the results to the calling function
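To close, here is a hedged sketch that packages this flow into one function. It reuses sliding_window, image_pyramid, the constants, and the imports defined above; classifier_to_detector is a hypothetical name, not part of the tutorial.
def classifier_to_detector(model, orig, min_conf=0.7):
    # steps 1-2: take the input image and build its pyramid
    (H, W) = orig.shape[:2]
    rois, locs = [], []
    for image in image_pyramid(orig, scale=PYR_SCALE, minSize=ROI_SIZE):
        scale = W / float(image.shape[1])
        # step 3: run the sliding window at each scale
        for (x, y, roiOrig) in sliding_window(image, WIN_STEP, ROI_SIZE):
            # 3a: extract and pre-process the ROI
            roi = cv2.resize(roiOrig, INPUT_SIZE)
            roi = tf.keras.applications.resnet.preprocess_input(
                tf.keras.preprocessing.image.img_to_array(roi))
            rois.append(roi)
            locs.append((int(x * scale), int(y * scale),
                         int((x + ROI_SIZE[0]) * scale),
                         int((y + ROI_SIZE[1]) * scale)))
    # 3b: classify all ROIs in one batch
    preds = tf.keras.applications.imagenet_utils.decode_predictions(
        model.predict(np.array(rois, dtype="float32")), top=1)
    # 3c: keep only confident detections, grouped by label
    labels = {}
    for (i, p) in enumerate(preds):
        (_, label, prob) = p[0]
        if prob >= min_conf:
            labels.setdefault(label, []).append((locs[i], prob))
    # step 4: class-wise non-maxima suppression
    results = {}
    for label, detections in labels.items():
        boxes = np.array([b for (b, _) in detections])
        probs = np.array([p for (_, p) in detections])
        results[label] = non_max_suppression(boxes, probs)
    # step 5: return the kept boxes per label to the caller
    return results
Calling results = classifier_to_detector(model, orig) and drawing each box as in section 6 reproduces the same pipeline end to end.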
