Merge branch 'dev' of https://gitlab.ilabt.imec.be/wesign/sign-predictor into dev
This commit is contained in:
31
export.py
Normal file
31
export.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
import torch
|
||||||
|
import torchvision
|
||||||
|
import onnx
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from src.model import SPOTER
|
||||||
|
from src.identifiers import LANDMARKS
|
||||||
|
|
||||||
|
# Export a trained SPOTER checkpoint to ONNX and validate the exported graph.
#
# Reads  models/<model_name>.pth
# Writes models/<model_name>.onnx

# Base name shared by the checkpoint and the exported ONNX file.
model_name = 'Fingerspelling_AE'

# Rebuild the network and restore the trained weights from the .pth file.
# NOTE(review): num_classes=5 has to match the checkpoint being loaded - confirm.
model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) * 2)
# map_location='cpu' lets a checkpoint that was saved from a GPU run be loaded
# on a machine without CUDA; the model itself is never moved off the CPU here.
state_dict = torch.load(f'models/{model_name}.pth', map_location='cpu')
model.load_state_dict(state_dict)

# set model to evaluation mode before it is traced for export
model.eval()

# create dummy input tensor of shape (batch_size, 108, num_of_frames)
# NOTE(review): 108 presumably equals len(LANDMARKS) * 2 (the hidden_dim above) - confirm.
batch_size = 1
num_of_frames = 1
input_shape = (108, num_of_frames)
dummy_input = torch.randn(batch_size, *input_shape)

# export model to ONNX format
# NOTE(review): no dynamic_axes are passed, so the exported graph is tied to the
# dummy input's shape; add dynamic_axes if variable batch/frame counts are needed.
output_file = f'models/{model_name}.onnx'
torch.onnx.export(model, dummy_input, output_file, input_names=['input'], output_names=['output'])

# load exported ONNX model for verification
onnx_model = onnx.load(output_file)
onnx.checker.check_model(onnx_model)
|
||||||
BIN
models/Fingerspelling_AE.onnx
Normal file
BIN
models/Fingerspelling_AE.onnx
Normal file
Binary file not shown.
BIN
models/Fingerspelling_AE.pth
Normal file
BIN
models/Fingerspelling_AE.pth
Normal file
Binary file not shown.
@@ -57,7 +57,7 @@ class FingerSpellingDataset(torch.utils.data.Dataset):
|
|||||||
video_name = self.data[index]
|
video_name = self.data[index]
|
||||||
|
|
||||||
# get the keypoints for the video
|
# get the keypoints for the video
|
||||||
keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize=True)
|
keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize="minxmax")
|
||||||
|
|
||||||
# filter the keypoints by the identified subset
|
# filter the keypoints by the identified subset
|
||||||
if self.keypoints_to_keep:
|
if self.keypoints_to_keep:
|
||||||
|
|||||||
@@ -27,14 +27,16 @@ class KeypointExtractor:
|
|||||||
|
|
||||||
def extract_keypoints_from_video(self,
|
def extract_keypoints_from_video(self,
|
||||||
video: str,
|
video: str,
|
||||||
normalize: bool = False,
|
normalize: str = None,
|
||||||
draw: bool = False,
|
draw: bool = False,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""extract_keypoints_from_video this function extracts keypoints from a video and stores them in a dataframe
|
"""extract_keypoints_from_video this function extracts keypoints from a video and stores them in a dataframe
|
||||||
|
|
||||||
:param video: the video to extract keypoints from
|
:param video: the video to extract keypoints from
|
||||||
:type video: str
|
:type video: str
|
||||||
:return: dataframe with keypoints
|
:param normalize: the hand normalization algorithm to use, defaults to None
|
||||||
|
:type normalize: str, optional
|
||||||
|
:return: dataframe with keypoints in absolute pixels
|
||||||
:rtype: pd.DataFrame
|
:rtype: pd.DataFrame
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -53,7 +55,7 @@ class KeypointExtractor:
|
|||||||
# create dataframe from cache
|
# create dataframe from cache
|
||||||
df = pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns)
|
df = pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns)
|
||||||
if normalize:
|
if normalize:
|
||||||
df = self.normalize_hands(df)
|
df = self.normalize_hands(df, norm_algorithm=normalize)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
# open video
|
# open video
|
||||||
@@ -98,6 +100,14 @@ class KeypointExtractor:
|
|||||||
new_df = pd.DataFrame(data, columns=self.columns)
|
new_df = pd.DataFrame(data, columns=self.columns)
|
||||||
keypoints_df = pd.concat([keypoints_df, new_df], ignore_index=True)
|
keypoints_df = pd.concat([keypoints_df, new_df], ignore_index=True)
|
||||||
|
|
||||||
|
# get frame width and height
|
||||||
|
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||||
|
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||||
|
|
||||||
|
# convert to pixels
|
||||||
|
keypoints_df.iloc[:, ::2] *= frame_width
|
||||||
|
keypoints_df.iloc[:, 1::2] *= frame_height
|
||||||
|
|
||||||
# close video
|
# close video
|
||||||
cap.release()
|
cap.release()
|
||||||
|
|
||||||
@@ -105,7 +115,7 @@ class KeypointExtractor:
|
|||||||
np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy())
|
np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy())
|
||||||
|
|
||||||
if normalize:
|
if normalize:
|
||||||
keypoints_df = self.normalize_hands(keypoints_df)
|
keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize)
|
||||||
|
|
||||||
if draw:
|
if draw:
|
||||||
return keypoints_df, output_frames
|
return keypoints_df, output_frames
|
||||||
@@ -133,16 +143,18 @@ class KeypointExtractor:
|
|||||||
self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
||||||
self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
||||||
|
|
||||||
|
img_width, img_height = image.shape[1], image.shape[0]
|
||||||
|
|
||||||
# create bounding box around hands
|
# create bounding box around hands
|
||||||
if results.left_hand_landmarks:
|
if results.left_hand_landmarks:
|
||||||
x = [landmark.x for landmark in results.left_hand_landmarks.landmark]
|
x = [landmark.x for landmark in results.left_hand_landmarks.landmark]
|
||||||
y = [landmark.y for landmark in results.left_hand_landmarks.landmark]
|
y = [landmark.y for landmark in results.left_hand_landmarks.landmark]
|
||||||
draw_image = cv2.rectangle(draw_image, (int(min(x) * 640), int(min(y) * 480)), (int(max(x) * 640), int(max(y) * 480)), (255, 0, 0), 2)
|
draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (0, 255, 0), 2)
|
||||||
|
|
||||||
if results.right_hand_landmarks:
|
if results.right_hand_landmarks:
|
||||||
x = [landmark.x for landmark in results.right_hand_landmarks.landmark]
|
x = [landmark.x for landmark in results.right_hand_landmarks.landmark]
|
||||||
y = [landmark.y for landmark in results.right_hand_landmarks.landmark]
|
y = [landmark.y for landmark in results.right_hand_landmarks.landmark]
|
||||||
draw_image = cv2.rectangle(draw_image, (int(min(x) * 640), int(min(y) * 480)), (int(max(x) * 640), int(max(y) * 480)), (255, 0, 0), 2)
|
draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (255, 0, 0), 2)
|
||||||
|
|
||||||
self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS)
|
self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS)
|
||||||
|
|
||||||
@@ -240,14 +252,21 @@ class KeypointExtractor:
|
|||||||
min_x, min_y = np.min(hand_coords[:, :, 0], axis=1), np.min(hand_coords[:, :, 1], axis=1)
|
min_x, min_y = np.min(hand_coords[:, :, 0], axis=1), np.min(hand_coords[:, :, 1], axis=1)
|
||||||
max_x, max_y = np.max(hand_coords[:, :, 0], axis=1), np.max(hand_coords[:, :, 1], axis=1)
|
max_x, max_y = np.max(hand_coords[:, :, 0], axis=1), np.max(hand_coords[:, :, 1], axis=1)
|
||||||
|
|
||||||
# calculate the deltas
|
# calculate the hand keypoint width and height (NOT the bounding box width and height!)
|
||||||
width, height = max_x - min_x, max_y - min_y
|
width, height = max_x - min_x, max_y - min_y
|
||||||
if width > height:
|
|
||||||
delta_x = 0.1 * width
|
# initialize empty arrays for deltas
|
||||||
delta_y = delta_x + ((width - height) / 2)
|
delta_x = np.zeros(width.shape, dtype='float64')
|
||||||
else:
|
delta_y = np.zeros(height.shape, dtype='float64')
|
||||||
delta_y = 0.1 * height
|
|
||||||
delta_x = delta_y + ((height - width) / 2)
|
# calculate the deltas
|
||||||
|
mask = width>height
|
||||||
|
# width > height
|
||||||
|
delta_x[mask] = (0.1 * width)[mask]
|
||||||
|
delta_y[mask] = (delta_x + ((width - height) / 2))[mask]
|
||||||
|
# height >= width
|
||||||
|
delta_y[~mask] = (0.1 * height)[~mask]
|
||||||
|
delta_x[~mask] = (delta_y + ((height - width) / 2))[~mask]
|
||||||
|
|
||||||
# Set the starting and ending point of the normalization bounding box
|
# Set the starting and ending point of the normalization bounding box
|
||||||
starting_x, starting_y = min_x - delta_x, min_y - delta_y
|
starting_x, starting_y = min_x - delta_x, min_y - delta_y
|
||||||
@@ -255,10 +274,10 @@ class KeypointExtractor:
|
|||||||
|
|
||||||
# calculate the center of the bounding box and the bounding box dimensions
|
# calculate the center of the bounding box and the bounding box dimensions
|
||||||
bbox_center_x, bbox_center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
|
bbox_center_x, bbox_center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
|
||||||
bbox_width, bbox_height = starting_x - ending_x, starting_y - ending_y
|
bbox_width, bbox_height = ending_x - starting_x, ending_y - starting_y
|
||||||
|
|
||||||
# repeat the center coordinates and bounding box dimensions to match the shape of hand_coords
|
# repeat the center coordinates and bounding box dimensions to match the shape of hand_coords
|
||||||
center_x, center_y = center_x.reshape(-1, 1, 1), center_y.reshape(-1, 1, 1)
|
bbox_center_x, bbox_center_y = bbox_center_x.reshape(-1, 1, 1), bbox_center_y.reshape(-1, 1, 1)
|
||||||
center_coords = np.concatenate((np.tile(bbox_center_x, (1, 21, 1)), np.tile(bbox_center_y, (1, 21, 1))), axis=2)
|
center_coords = np.concatenate((np.tile(bbox_center_x, (1, 21, 1)), np.tile(bbox_center_y, (1, 21, 1))), axis=2)
|
||||||
|
|
||||||
bbox_width, bbox_height = bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1 ,1)
|
bbox_width, bbox_height = bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1 ,1)
|
||||||
|
|||||||
@@ -18,7 +18,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"video_name = 'A_robbe.mp4' "
|
"video_name = '69547.mp4' "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -28,7 +28,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# extract keypoints\n",
|
"# extract keypoints\n",
|
||||||
"keypoint_extractor = KeypointExtractor('data/fingerspelling/data/')"
|
"keypoint_extractor = KeypointExtractor('data/videos/')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -48,7 +48,7 @@
|
|||||||
"duration = 10\n",
|
"duration = 10\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Create a dummy video of random noise\n",
|
"# Create a dummy video of random noise\n",
|
||||||
"_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, draw=True)\n",
|
"_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\", draw=True)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Convert the video to a numpy array\n",
|
"# Convert the video to a numpy array\n",
|
||||||
"video = np.array(video_frames)\n",
|
"video = np.array(video_frames)\n",
|
||||||
@@ -135,9 +135,9 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Set video, hand and frame to display\n",
|
"#Set video, hand and frame to display\n",
|
||||||
"video_name = 'A_victor.mp4'\n",
|
"video_name = '69547.mp4'\n",
|
||||||
"hand = \"right\"\n",
|
"hand = \"right\"\n",
|
||||||
"frame = 1\n",
|
"frame = 3\n",
|
||||||
"%reload_ext autoreload"
|
"%reload_ext autoreload"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -151,11 +151,11 @@
|
|||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Extract keypoints from requested video\n",
|
"#Extract keypoints from requested video\n",
|
||||||
"keypoints_extractor = KeypointExtractor(\"data/fingerspelling/data/\")\n",
|
"keypoints_extractor = KeypointExtractor(\"data/videos/\")\n",
|
||||||
"\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"#Plot the hand keypoints\n",
|
"#Plot the hand keypoints\n",
|
||||||
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=False)\n",
|
"df = keypoints_extractor.extract_keypoints_from_video(video_name)\n",
|
||||||
|
"df.head()\n",
|
||||||
"plot_hand_keypoints(df, hand, frame)"
|
"plot_hand_keypoints(df, hand, frame)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -165,10 +165,42 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Plot the NORMALIZED hand keypoints\n",
|
"#Plot the NORMALIZED hand keypoints (using minmax)\n",
|
||||||
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=True)\n",
|
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\")\n",
|
||||||
"plot_hand_keypoints(df, hand, frame)"
|
"plot_hand_keypoints(df, hand, frame)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#Plot the NORMALIZED hand keypoints (using bohacek)\n",
|
||||||
|
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"bohacek\")\n",
|
||||||
|
"plot_hand_keypoints(df, hand, frame)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
@@ -61,35 +61,63 @@ while True:
|
|||||||
if k1 and (k2 or k3):
|
if k1 and (k2 or k3):
|
||||||
data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])
|
data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])
|
||||||
|
|
||||||
def normalize_hand(frame, data, hand, algorithm="minmax"):
    """Normalize one hand's keypoints in ``data`` and draw its bounding box on ``frame``.

    The 42 values of the selected hand are read from row 0 of ``data`` as
    21 (x, y) pairs, scaled by the frame width/height, expressed relative to
    the centre of a bounding box and divided by the box dimensions. The result
    is written back into ``data`` in place.

    :param frame: image whose ``shape[0]`` / ``shape[1]`` are used as height / width
        and on which the bounding box is drawn
    :param data: 2D array; row 0 holds the keypoints (columns 66-107 for the left
        hand, 108-149 for the right hand)
    :param hand: ``"right_hand"`` selects the right-hand columns; any other value
        selects the left-hand columns
    :param algorithm: ``"minmax"`` uses the tight keypoint bounding box,
        ``"bohacek"`` uses a padded, squared bounding box; defaults to ``"minmax"``
    :return: tuple ``(data, frame)``; both are returned untouched when the
        algorithm is unknown or the bounding box has zero width or height
    """
    # Reject unknown algorithms before doing any work; neither input is modified.
    if algorithm not in ("minmax", "bohacek"):
        print("Not a valid normalization algorithm")
        return data, frame

    first_column = 66 + (42 if hand == "right_hand" else 0)
    hand_columns = np.arange(first_column, first_column + 42)

    # Indexing with an index array yields a copy, so the scaling below does not
    # touch `data` until the explicit write-back at the end.
    hand_data = np.array(data[0])[hand_columns].reshape(21, 2)

    # convert to absolute pixels
    # NOTE(review): assumes the stored coordinates are fractions of the frame size - confirm.
    hand_data[:, 0] *= frame.shape[1]
    hand_data[:, 1] *= frame.shape[0]

    min_x, min_y = np.min(hand_data[:, 0]), np.min(hand_data[:, 1])
    max_x, max_y = np.max(hand_data[:, 0]), np.max(hand_data[:, 1])

    # extent of the keypoints themselves (not yet the normalization box)
    width, height = max_x - min_x, max_y - min_y

    if algorithm == "minmax":
        # the box is exactly the keypoint extent
        starting_x, starting_y = min_x, min_y
        ending_x, ending_y = max_x, max_y
    else:  # "bohacek"
        # pad the longer side by 10% and grow the shorter side so the box is square
        if width > height:
            delta_x = 0.1 * width
            delta_y = delta_x + ((width - height) / 2)
        else:
            delta_y = 0.1 * height
            delta_x = delta_y + ((height - width) / 2)

        starting_x, starting_y = min_x - delta_x, min_y - delta_y
        ending_x, ending_y = max_x + delta_x, max_y + delta_y

    center_x, center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
    bbox_width, bbox_height = ending_x - starting_x, ending_y - starting_y

    # A degenerate box (e.g. an all-zero, undetected hand) cannot be normalized.
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    # Broadcasting applies the (x, y) centre and box size to all 21 keypoints.
    center_coords = np.array([center_x, center_y])
    bbox_dims = np.array([bbox_width, bbox_height])
    hand_data = (hand_data - center_coords) / bbox_dims

    # add bounding box to frame
    frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)), (int(ending_x), int(ending_y)), (0, 255, 0), 2)

    data[:, hand_columns] = hand_data.reshape(-1, 42)
    return data, frame
|
||||||
|
|
||||||
data, frame = normalize_hand(frame, data, "left_hand")
|
norm_alg = "minmax"
|
||||||
data, frame = normalize_hand(frame, data, "right_hand")
|
|
||||||
|
data, frame = normalize_hand(frame, data, "left_hand", norm_alg)
|
||||||
|
data, frame = normalize_hand(frame, data, "right_hand", norm_alg)
|
||||||
|
|
||||||
# get values of the landmarks as a list of integers
|
# get values of the landmarks as a list of integers
|
||||||
values = []
|
values = []
|
||||||
|
|||||||
Reference in New Issue
Block a user