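"""Simple colour-based hand gesture recogniser.

Segments skin tones in HSV space, fits a circle around the hand's centre of gravity,
and counts mask transitions along that circle to estimate how many fingers are raised.
A TensorFlow hand-detection CNN from Victor Dibia's handtracking project can also be
used to locate the hand in the frame.
"""
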
from GestureRecognition.handrecogniser import HandRecogniser
import numpy as np
import cv2
import tensorflow as tf


class SimpleHandRecogniser(HandRecogniser):

    def __init__(self, frame):
        self.img = frame
        self.graph = None
        self.sess = None

    def __calc_pos_y(self, x, radius, centre):
        """
        Calculates the y coordinate on the circle with the given radius and centre
        for a given x coordinate.
        """
        return int((radius**2 - (x - centre[0])**2)**(1/2) + centre[1])

    def __segment_image(self):
        """
        Segments the hand from the rest of the image to produce a binary threshold mask.
        """
        self.img_hsv = cv2.GaussianBlur(self.img_hsv, (5, 5), 0)

        lower_skin = (0, 0, 153)
        upper_skin = (45, 153, 255)

        # Only the mask is needed, as it is enough to perform the hand segmentation.
        self.mask = cv2.inRange(self.img_hsv, lower_skin, upper_skin)

        # Apply another blur to remove any small holes/noise, then re-threshold to binary.
        self.mask = self.__denoise(self.mask)
        ret, self.mask = cv2.threshold(self.mask, 50, 255, cv2.THRESH_BINARY)

    def __denoise(self, image):
        """
        Applies a 5x5 Gaussian blur to remove noise from the image.
        """
        return cv2.GaussianBlur(image, (5, 5), 0)

    def __calc_circle(self, image, radius_percent=0.52):
        """
        Calculates the equation of the circle (radius, centre) from the current
        threshold mask, so that the centre is the centre of gravity of the
        thresholded pixels and the radius is, by default, 52% of the distance
        from the centre to the furthest extreme point of the mask.
        """
        k = np.sum(self.mask) / 255

        # Build x and y coordinate grids matching the mask's shape.
        x_ind = np.arange(0, self.mask.shape[1])
        y_ind = np.arange(0, self.mask.shape[0])
        coords_x = np.zeros((self.mask.shape[0], self.mask.shape[1]))
        coords_y = np.zeros((self.mask.shape[0], self.mask.shape[1]))
        coords_x[:, :] = x_ind

        # Even this loop is quick, as it fills whole rows of the numpy array,
        # which in Python is much faster than filling columns.
        for element in y_ind:
            coords_y[element, :] = element

        # The average x and y values of the masked pixels give the centre of gravity.
        centre = (int(np.sum(coords_x[self.mask == 255]) / k),
                  int(np.sum(coords_y[self.mask == 255]) / k))

        # Calculate the radius of the circle from the extreme x and y values of the mask.
        x_min = np.min(coords_x[self.mask == 255])
        x_max = np.max(coords_x[self.mask == 255])
        y_min = np.min(coords_y[self.mask == 255])
        y_max = np.max(coords_y[self.mask == 255])

        candidate_pts = [(x_min, y_min), (x_min, y_max), (x_max, y_min), (x_max, y_max)]
        radius = 0

        # Check each candidate point to see which is furthest from the centre.
        for pt in candidate_pts:
            # Euclidean distance from the centre.
            new_distance = ((pt[0] - centre[0])**2 + (pt[1] - centre[1])**2)**(1/2)
            if new_distance > radius:
                radius = new_distance

        radius = int(radius * radius_percent)

        return radius, centre

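    # Worked example for __calc_circle (hypothetical 3x3 mask): with white pixels at
    # (x, y) = (0, 0), (2, 0) and (1, 2), the centre of gravity is
    # ((0 + 2 + 1) / 3, (0 + 0 + 2) / 3) -> (1, 0) after int() truncation, the furthest
    # corner candidate lies at distance sqrt(5), and the returned radius is
    # int(sqrt(5) * 0.52) = 1.
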
    def __shift_pixels(self, image, shift_radius):
        """
        Shifts the hue channel by shift_radius, wrapping values above OpenCV's
        maximum hue of 179, so that red skin tones occupy a contiguous range.
        """
        image[:, :, 0] = image[:, :, 0] + shift_radius
        image[:, :, 0] = np.where(image[:, :, 0] > 179, image[:, :, 0] - 179, image[:, :, 0])
        return image

    def get_gesture(self):
        """
        Calculates the actual gesture, returning the number of fingers
        seen in the image.
        """
        print('Getting Gesture')
        if self.img is None:
            print('There is no image')
            return -1

        # First detect the hand in the frame using the neural network.
        self.load_inference_graph()
        print("loaded inference graph")
        detections, scores = self.detect_hand_tensorflow(self.graph, self.sess)

        print('Attempting to use pure hand recognition')
        self.img_hsv = cv2.cvtColor(self.img, cv2.COLOR_BGR2HSV)

        # Shift the hue channel so the red skin tones form one contiguous range
        # instead of wrapping around the 179/0 boundary of OpenCV's hue scale.
        self.img_hsv = self.__shift_pixels(self.img_hsv, 30)

        self.img_hsv = self.__denoise(self.img_hsv)
        self.__segment_image()

        print('calculating circle')
        radius, centre = self.__calc_circle(self.mask)
        print('Got circle')

        # Now walk around the circle and count the number of transitions between
        # 0 and 255 in the mask. First just do it the naive way with loops.
        # Equation of the (upper half of the) circle: y = sqrt(r^2 - (x - cx)^2) + cy
        prev_x = max(centre[0] - radius, 0)  # clamp in case the circle extends past the left edge
        prev_y = [self.__calc_pos_y(centre[0] - radius, radius, centre),
                  self.__calc_pos_y(centre[0] - radius, radius, centre)]
        num_change = 0

        # Make sure x is also within bounds.
        x_start = centre[0] - radius + 1
        if x_start < 0:
            x_start = 0

        x_end = centre[0] + radius
        if x_end >= self.mask.shape[1]:
            x_end = self.mask.shape[1] - 1

        print(x_start)
        print(x_end)
        print(self.mask.shape)

        for x in range(x_start, x_end):
            # Need to check the circle stays inside the image bounds.
            ypos = self.__calc_pos_y(x, radius, centre)
            # y above the centre (ypos) and the mirrored y below the centre.
            y = [ypos, centre[1] - (ypos - centre[1])]

            if y[0] < 0:
                y[0] = 0
            if y[0] >= self.mask.shape[0]:
                y[0] = self.mask.shape[0] - 1
            if y[1] < 0:
                y[1] = 0
            if y[1] >= self.mask.shape[0]:
                y[1] = self.mask.shape[0] - 1

            if self.mask[y[0], x] != self.mask[prev_y[0], prev_x]:
                num_change += 1
            if self.mask[y[1], x] != self.mask[prev_y[1], prev_x] and y[0] != y[1]:
                num_change += 1

            prev_x = x
            prev_y = y

        print('Finished calculating, returning')

        # Each raised finger crosses the circle twice, and the wrist adds one more
        # pair of crossings, hence the division by two and the subtraction of one.
        return num_change / 2 - 1

    def setFrame(self, frame):
        self.img = frame

    # Source: Victor Dibia
    # Link: https://github.com/victordibia/handtracking
    # Taken straight from his example, as it works perfectly. This is specifically
    # the load_inference_graph method that he wrote; it loads the graph into
    # memory if one has not already been loaded for this object.
    def load_inference_graph(self):
        """Loads a TensorFlow model checkpoint into memory."""

        if self.graph is not None and self.sess is not None:
            # Don't load more than once.
            return

        PATH_TO_CKPT = '/Users/piv/Documents/Projects/car/GestureRecognition/frozen_inference_graph.pb'
        # Load the frozen TensorFlow model into memory.
        detection_graph = tf.Graph()
        with detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')
            sess = tf.Session(graph=detection_graph)
        self.graph = detection_graph
        self.sess = sess

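    # Note: tf.GraphDef, tf.gfile.GFile and tf.Session above are TensorFlow 1.x APIs;
    # under TensorFlow 2.x the equivalent calls are available via tf.compat.v1.
    # This file assumes a TensorFlow 1.x installation.
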
    # Source: Victor Dibia
    # Link: https://github.com/victordibia/handtracking
    # Taken straight from his example, as it works perfectly. This is specifically
    # the detect_hand method that he wrote; further processing is still required for
    # the hand recognition to work correctly.
    def detect_hand_tensorflow(self, detection_graph, sess):
        """Detects hands in a frame using a CNN.

        detection_graph -- The CNN to use to detect the hand.
        sess -- The TensorFlow session for the given graph.
        """

        # Input and output tensors of the detection graph.
        image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
        detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
        detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
        detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
        num_detections = detection_graph.get_tensor_by_name('num_detections:0')

        img_expanded = np.expand_dims(self.img, axis=0)

        (boxes, scores, classes, num) = sess.run(
            [detection_boxes, detection_scores, detection_classes, num_detections],
            feed_dict={image_tensor: img_expanded})
        print('finished detection')
        return np.squeeze(boxes), np.squeeze(scores)

    def detect_hand_opencv(self, model_path, config_path):
        """Performs hand detection using the TensorFlow CNN loaded through OpenCV's dnn module.

        model_path -- Path to the frozen TensorFlow model (.pb file).
        config_path -- Path to the matching graph config (.pbtxt file).
        """
        if self.img is None:
            return

        height = self.img.shape[0]
        width = self.img.shape[1]

        scale = 0.5

        net = cv2.dnn.readNetFromTensorflow(model_path, config_path)

        # The second blob dimension is scaled so the blob keeps the same ratio as the original image.
        net.setInput(cv2.dnn.blobFromImage(self.img, scale, size=(300, int(300 * (width / height))), swapRB=True, crop=False))
        netOut = net.forward()

        # Format the output to look the same as the TensorFlow output:
        # each detection row is (batch_id, class_id, score, left, top, right, bottom).
        scores = []
        boxes = []

        for detection in netOut[0, 0]:
            scores.append(float(detection[2]))
            # The detector only looks for the hand class, so every detection is kept.
            boxes.append((detection[3], detection[4], detection[5], detection[6]))

        return np.array(boxes), np.array(scores)

    def get_best_hand(self, boxes, scores, conf_thresh, nms_thresh):
        """
        Gets the best hand bounding box by inspecting confidence scores and overlapping
        boxes, as well as the overall size of each box, to determine which hand (if
        multiple are present) should be used for recognition.
        """
        # First remove any boxes below the confidence threshold.
        confident_bs = boxes[scores > conf_thresh]

        # Then use NMS to get rid of heavily overlapping boxes.
        # This wasn't used in the tensorflow example that was found, but it is probably
        # a good idea to use it just in case. Note that cv2.dnn.NMSBoxes expects boxes
        # in (x, y, w, h) form together with their scores.
        indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thresh, nms_thresh)

        # Finally, the area of each box could be used to determine which hand is the
        # clearest (biggest in the image). For now this just returns the most confident box.
        max_conf = 0
        max_index = 0
        for i, conf in enumerate(scores):
            if conf > max_conf:
                max_conf = conf
                max_index = i
        return boxes[max_index]
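

# Example usage sketch: assumes a test image 'hand.jpg' (hypothetical filename) in the
# working directory and a valid frozen_inference_graph.pb at the path hard-coded in
# load_inference_graph.
if __name__ == '__main__':
    frame = cv2.imread('hand.jpg')
    recogniser = SimpleHandRecogniser(frame)
    fingers = recogniser.get_gesture()  # returns -1 when no frame is available
    print('Estimated number of raised fingers:', fingers)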