Merge branch 'dev' into 'main'
Release Sprint 2 Closes WES-41 See merge request wesign/sign-predictor!11
This commit was merged in pull request #11.
This commit is contained in:
@@ -7,7 +7,7 @@ steps:
|
|||||||
pull: if-not-exists
|
pull: if-not-exists
|
||||||
image: sonarsource/sonar-scanner-cli
|
image: sonarsource/sonar-scanner-cli
|
||||||
commands:
|
commands:
|
||||||
- sonar-scanner -Dsonar.host.url=$SONAR_HOST -Dsonar.login=$SONAR_TOKEN -Dsonar.projectKey=$SONAR_PROJECT_KEY
|
- sonar-scanner -Dsonar.host.url=$SONAR_HOST -Dsonar.login=$SONAR_TOKEN -Dsonar.projectKey=$SONAR_PROJECT_KEY -Dsonar.qualitygate.wait=true
|
||||||
environment:
|
environment:
|
||||||
SONAR_HOST:
|
SONAR_HOST:
|
||||||
from_secret: sonar_host
|
from_secret: sonar_host
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -6,3 +6,6 @@ cache/
|
|||||||
cache_wlasl/
|
cache_wlasl/
|
||||||
|
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
||||||
|
checkpoints/
|
||||||
|
.ipynb_checkpoints
|
||||||
0
__init__.py
Normal file
0
__init__.py
Normal file
120
analyze_model.ipynb
Normal file
120
analyze_model.ipynb
Normal file
File diff suppressed because one or more lines are too long
31
export.py
Normal file
31
export.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
import torch
|
||||||
|
import torchvision
|
||||||
|
import onnx
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from src.model import SPOTER
|
||||||
|
from src.identifiers import LANDMARKS
|
||||||
|
|
||||||
|
model_name = 'Fingerspelling_AE'

# Load the PyTorch weights from the .pth file.
# map_location='cpu' lets a checkpoint that was saved on a GPU machine be
# loaded (and exported) on a machine without CUDA.
model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) * 2)
state_dict = torch.load('models/' + model_name + '.pth', map_location='cpu')
model.load_state_dict(state_dict)

# Put the model in evaluation mode before exporting it.
model.eval()

# Create the dummy input tensor that is passed to the exporter.
# NOTE(review): 108 is presumably len(LANDMARKS) * 2 - confirm against the model.
batch_size = 1
num_of_frames = 1
input_shape = (108, num_of_frames)
dummy_input = torch.randn(batch_size, *input_shape)

# Export the model to ONNX format next to the .pth file.
output_file = 'models/' + model_name + '.onnx'
torch.onnx.export(model, dummy_input, output_file, input_names=['input'], output_names=['output'])

# Load the exported ONNX model again and run the ONNX checker on it.
onnx_model = onnx.load(output_file)
onnx.checker.check_model(onnx_model)
|
||||||
BIN
models/Fingerspelling_AE.onnx
Normal file
BIN
models/Fingerspelling_AE.onnx
Normal file
Binary file not shown.
BIN
models/Fingerspelling_AE.pth
Normal file
BIN
models/Fingerspelling_AE.pth
Normal file
Binary file not shown.
BIN
models/model_A-E.pth
Normal file
BIN
models/model_A-E.pth
Normal file
Binary file not shown.
BIN
models/model_A-L.pth
Normal file
BIN
models/model_A-L.pth
Normal file
Binary file not shown.
@@ -2,4 +2,5 @@ torch==1.13.1
|
|||||||
torchvision==0.14.1
|
torchvision==0.14.1
|
||||||
pandas==1.5.3
|
pandas==1.5.3
|
||||||
mediapipe==0.9.1.0
|
mediapipe==0.9.1.0
|
||||||
tensorboard==2.12.0
|
tensorboard==2.12.0
|
||||||
|
mediapy==1.1.6
|
||||||
11
src/augmentations.py
Normal file
11
src/augmentations.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
class MirrorKeypoints:
    """Augmentation that randomly replaces a sample with ``1 - sample``.

    One random number is drawn per call; when it is greater than 0.5 the
    sample is returned untouched, otherwise every value ``v`` becomes ``1 - v``.

    NOTE(review): the subtraction is applied to every value of the sample,
    so this presumably expects coordinates scaled to [0, 1] - confirm
    against what the dataset actually feeds in.
    """

    def __call__(self, sample):
        keep_original = random.random() > 0.5
        if not keep_original:
            # flip the keypoints tensor
            sample = 1 - sample
        return sample
|
||||||
@@ -33,7 +33,7 @@ class FingerSpellingDataset(torch.utils.data.Dataset):
|
|||||||
# TODO: make split for train and val and test when enough data is available
|
# TODO: make split for train and val and test when enough data is available
|
||||||
|
|
||||||
# split the data into train and val and test and make them balanced
|
# split the data into train and val and test and make them balanced
|
||||||
x_train, x_test, y_train, y_test = train_test_split(files, labels, test_size=0.4, random_state=1, stratify=labels)
|
x_train, x_test, y_train, y_test = train_test_split(files, labels, test_size=0.3, random_state=1, stratify=labels)
|
||||||
|
|
||||||
if subset == "train":
|
if subset == "train":
|
||||||
self.data = x_train
|
self.data = x_train
|
||||||
@@ -57,7 +57,7 @@ class FingerSpellingDataset(torch.utils.data.Dataset):
|
|||||||
video_name = self.data[index]
|
video_name = self.data[index]
|
||||||
|
|
||||||
# get the keypoints for the video
|
# get the keypoints for the video
|
||||||
keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name)
|
keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize="minxmax")
|
||||||
|
|
||||||
# filter the keypoints by the identified subset
|
# filter the keypoints by the identified subset
|
||||||
if self.keypoints_to_keep:
|
if self.keypoints_to_keep:
|
||||||
@@ -73,4 +73,7 @@ class FingerSpellingDataset(torch.utils.data.Dataset):
|
|||||||
# data to tensor
|
# data to tensor
|
||||||
data = torch.from_numpy(current_row)
|
data = torch.from_numpy(current_row)
|
||||||
|
|
||||||
|
if self.transform:
|
||||||
|
data = self.transform(data)
|
||||||
|
|
||||||
return data, label
|
return data, label
|
||||||
@@ -4,8 +4,8 @@ from collections import OrderedDict
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from identifiers import LANDMARKS
|
from src.identifiers import LANDMARKS
|
||||||
from keypoint_extractor import KeypointExtractor
|
from src.keypoint_extractor import KeypointExtractor
|
||||||
|
|
||||||
|
|
||||||
class WLASLDataset(torch.utils.data.Dataset):
|
class WLASLDataset(torch.utils.data.Dataset):
|
||||||
|
|||||||
@@ -27,27 +27,36 @@ class KeypointExtractor:
|
|||||||
|
|
||||||
def extract_keypoints_from_video(self,
|
def extract_keypoints_from_video(self,
|
||||||
video: str,
|
video: str,
|
||||||
|
normalize: str = None,
|
||||||
|
draw: bool = False,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""extract_keypoints_from_video this function extracts keypoints from a video and stores them in a dataframe
|
"""extract_keypoints_from_video this function extracts keypoints from a video and stores them in a dataframe
|
||||||
|
|
||||||
:param video: the video to extract keypoints from
|
:param video: the video to extract keypoints from
|
||||||
:type video: str
|
:type video: str
|
||||||
:return: dataframe with keypoints
|
:param normalize: the hand normalization algorithm to use, defaults to None
|
||||||
|
:type normalize: str, optional
|
||||||
|
:return: dataframe with keypoints in absolute pixels
|
||||||
:rtype: pd.DataFrame
|
:rtype: pd.DataFrame
|
||||||
"""
|
"""
|
||||||
# check if video exists
|
|
||||||
if not os.path.exists(self.video_folder + video):
|
|
||||||
logging.error("Video does not exist at path: " + self.video_folder + video)
|
|
||||||
return None
|
|
||||||
|
|
||||||
# check if cache exists
|
if not draw:
|
||||||
if not os.path.exists(self.cache_folder):
|
# check if video exists
|
||||||
os.makedirs(self.cache_folder)
|
if not os.path.exists(self.video_folder + video):
|
||||||
|
logging.error("Video does not exist at path: " + self.video_folder + video)
|
||||||
|
return None
|
||||||
|
|
||||||
# check if cache file exists and return
|
# check if cache exists
|
||||||
if os.path.exists(self.cache_folder + "/" + video + ".npy"):
|
if not os.path.exists(self.cache_folder):
|
||||||
# create dataframe from cache
|
os.makedirs(self.cache_folder)
|
||||||
return pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns)
|
|
||||||
|
# check if cache file exists and return
|
||||||
|
if os.path.exists(self.cache_folder + "/" + video + ".npy"):
|
||||||
|
# create dataframe from cache
|
||||||
|
df = pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns)
|
||||||
|
if normalize:
|
||||||
|
df = self.normalize_hands(df, norm_algorithm=normalize)
|
||||||
|
return df
|
||||||
|
|
||||||
# open video
|
# open video
|
||||||
cap = cv2.VideoCapture(self.video_folder + video)
|
cap = cv2.VideoCapture(self.video_folder + video)
|
||||||
@@ -56,7 +65,9 @@ class KeypointExtractor:
|
|||||||
|
|
||||||
# extract frames from video so we extract 5 frames per second
|
# extract frames from video so we extract 10 frames per second
|
||||||
frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
|
frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
|
||||||
frame_skip = frame_rate // 5
|
frame_skip = frame_rate // 10
|
||||||
|
|
||||||
|
output_frames = []
|
||||||
|
|
||||||
while cap.isOpened():
|
while cap.isOpened():
|
||||||
|
|
||||||
@@ -70,7 +81,11 @@ class KeypointExtractor:
|
|||||||
if not success:
|
if not success:
|
||||||
break
|
break
|
||||||
# extract keypoints of frame
|
# extract keypoints of frame
|
||||||
results = self.extract_keypoints_from_frame(image)
|
if draw:
|
||||||
|
results, draw_image = self.extract_keypoints_from_frame(image, draw=True)
|
||||||
|
output_frames.append(draw_image)
|
||||||
|
else:
|
||||||
|
results = self.extract_keypoints_from_frame(image)
|
||||||
|
|
||||||
def extract_keypoints(landmarks):
|
def extract_keypoints(landmarks):
|
||||||
if landmarks:
|
if landmarks:
|
||||||
@@ -80,8 +95,18 @@ class KeypointExtractor:
|
|||||||
k1 = extract_keypoints(results.pose_landmarks)
|
k1 = extract_keypoints(results.pose_landmarks)
|
||||||
k2 = extract_keypoints(results.left_hand_landmarks)
|
k2 = extract_keypoints(results.left_hand_landmarks)
|
||||||
k3 = extract_keypoints(results.right_hand_landmarks)
|
k3 = extract_keypoints(results.right_hand_landmarks)
|
||||||
if k1 and k2 and k3:
|
if k1 and (k2 or k3):
|
||||||
keypoints_df = pd.concat([keypoints_df, pd.DataFrame([k1+k2+k3], columns=self.columns)])
|
data = [k1 + (k2 or [0] * 42) + (k3 or [0] * 42)]
|
||||||
|
new_df = pd.DataFrame(data, columns=self.columns)
|
||||||
|
keypoints_df = pd.concat([keypoints_df, new_df], ignore_index=True)
|
||||||
|
|
||||||
|
# get frame width and height
|
||||||
|
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||||
|
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||||
|
|
||||||
|
# convert to pixels
|
||||||
|
keypoints_df.iloc[:, ::2] *= frame_width
|
||||||
|
keypoints_df.iloc[:, 1::2] *= frame_height
|
||||||
|
|
||||||
# close video
|
# close video
|
||||||
cap.release()
|
cap.release()
|
||||||
@@ -89,6 +114,12 @@ class KeypointExtractor:
|
|||||||
# save keypoints to cache
|
# save keypoints to cache
|
||||||
np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy())
|
np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy())
|
||||||
|
|
||||||
|
if normalize:
|
||||||
|
keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize)
|
||||||
|
|
||||||
|
if draw:
|
||||||
|
return keypoints_df, output_frames
|
||||||
|
|
||||||
return keypoints_df
|
return keypoints_df
|
||||||
|
|
||||||
|
|
||||||
@@ -108,11 +139,156 @@ class KeypointExtractor:
|
|||||||
if draw:
|
if draw:
|
||||||
# Draw the pose annotations on the image
|
# Draw the pose annotations on the image
|
||||||
draw_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
draw_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||||
self.mp_drawing.draw_landmarks(draw_image, results.face_landmarks, self.mp_holistic.FACEMESH_CONTOURS)
|
# self.mp_drawing.draw_landmarks(draw_image, results.face_landmarks, self.mp_holistic.FACEMESH_CONTOURS)
|
||||||
self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
||||||
self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
|
||||||
|
|
||||||
|
img_width, img_height = image.shape[1], image.shape[0]
|
||||||
|
|
||||||
|
# create bounding box around hands
|
||||||
|
if results.left_hand_landmarks:
|
||||||
|
x = [landmark.x for landmark in results.left_hand_landmarks.landmark]
|
||||||
|
y = [landmark.y for landmark in results.left_hand_landmarks.landmark]
|
||||||
|
draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (0, 255, 0), 2)
|
||||||
|
|
||||||
|
if results.right_hand_landmarks:
|
||||||
|
x = [landmark.x for landmark in results.right_hand_landmarks.landmark]
|
||||||
|
y = [landmark.y for landmark in results.right_hand_landmarks.landmark]
|
||||||
|
draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (255, 0, 0), 2)
|
||||||
|
|
||||||
self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS)
|
self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS)
|
||||||
|
|
||||||
return results, draw_image
|
return results, draw_image
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_hands(self, dataframe: pd.DataFrame, norm_algorithm: str="minmax") -> pd.DataFrame:
    """normalize_hands this function normalizes the left and right hand keypoints of a dataframe

    :param dataframe: the dataframe to normalize
    :type dataframe: pd.DataFrame
    :param norm_algorithm: the normalization algorithm to use, pick from "minmax" and "bohacek"
    :type norm_algorithm: str
    :return: the normalized dataframe; the input is returned unchanged when the algorithm name is unknown
    :rtype: pd.DataFrame
    """
    if norm_algorithm == "minmax":
        normalize_hand = self.normalize_hand_minmax
    elif norm_algorithm == "bohacek":
        normalize_hand = self.normalize_hand_bohacek
    else:
        # keep the lenient behaviour (no exception), but make a misspelled
        # algorithm name visible instead of silently skipping normalization
        logging.warning("Unknown hand normalization algorithm %r, keypoints are left unnormalized", norm_algorithm)
        return dataframe

    # left hand first, then right hand
    for hand in ("left_hand", "right_hand"):
        dataframe = normalize_hand(dataframe, hand)

    return dataframe
|
||||||
|
|
||||||
|
def normalize_hand_minmax(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
    """normalize_hand_minmax this function normalizes the hand keypoints of a dataframe with respect to the minimum and maximum coordinates

    Every frame (row) is normalized on its own: the keypoints are shifted by the
    center of their bounding box and divided by the bounding box width / height.
    Frames whose bounding box has a zero width or height (for example a hand
    that is filled with zeros) are left untouched, the other frames are still
    normalized. The dataframe is modified in place and also returned.

    :param dataframe: the dataframe to normalize
    :type dataframe: pd.DataFrame
    :param hand: the hand to normalize, "right_hand" selects the right hand, anything else the left hand
    :type hand: str
    :return: the normalized dataframe
    :rtype: pd.DataFrame
    """
    # get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149)
    first_column = 66 + (42 if hand == "right_hand" else 0)
    hand_columns = np.arange(first_column, first_column + 42)

    # get the x, y coordinates of the hand keypoints, one (21, 2) block per frame
    hand_coords = dataframe.iloc[:, hand_columns].to_numpy(dtype="float64").reshape(-1, 21, 2)

    # min and max x, y per frame; keepdims gives shape (frames, 1, 2) so that
    # the values broadcast against hand_coords without tiling
    min_xy = hand_coords.min(axis=1, keepdims=True)
    max_xy = hand_coords.max(axis=1, keepdims=True)

    # center and dimensions of the bounding box around the hand keypoints
    center_coords = (min_xy + max_xy) / 2
    bbox_dims = max_xy - min_xy

    # only frames with a non-degenerate bounding box can be normalized
    valid = np.all(bbox_dims != 0, axis=(1, 2))
    if not valid.any():
        return dataframe

    # normalize the hand keypoints based on the bounding box around the hand
    norm_hand_coords = hand_coords.copy()
    norm_hand_coords[valid] = (hand_coords[valid] - center_coords[valid]) / bbox_dims[valid]

    # flatten the normalized hand keypoints and write them back into the dataframe
    dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42)

    return dataframe
|
||||||
|
|
||||||
|
def normalize_hand_bohacek(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
    """normalize_hand_bohacek this function normalizes the hand keypoints of a dataframe using the bohacek normalization algorithm

    Every frame (row) is normalized on its own: the extent of the hand keypoints
    is padded on both axes, and the keypoints are shifted by the center of the
    padded box and divided by its width / height. Frames whose padded box has a
    zero width or height (for example a hand that is filled with zeros) are left
    untouched, the other frames are still normalized. The dataframe is modified
    in place and also returned.

    :param dataframe: the dataframe to normalize
    :type dataframe: pd.DataFrame
    :param hand: the hand to normalize, "right_hand" selects the right hand, anything else the left hand
    :type hand: str
    :return: the normalized dataframe
    :rtype: pd.DataFrame
    """
    # get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149)
    first_column = 66 + (42 if hand == "right_hand" else 0)
    hand_columns = np.arange(first_column, first_column + 42)

    # get the x, y coordinates of the hand keypoints, one (21, 2) block per frame
    hand_coords = dataframe.iloc[:, hand_columns].to_numpy(dtype="float64").reshape(-1, 21, 2)

    # get the min and max x, y coordinates of the hand keypoints per frame
    min_x, min_y = hand_coords[:, :, 0].min(axis=1), hand_coords[:, :, 1].min(axis=1)
    max_x, max_y = hand_coords[:, :, 0].max(axis=1), hand_coords[:, :, 1].max(axis=1)

    # calculate the hand keypoint width and height (NOT the bounding box width and height!)
    width, height = max_x - min_x, max_y - min_y

    # calculate the deltas: the longer side is padded by 10% of its length,
    # the shorter side gets that padding plus half of the difference in length
    wider = width > height
    delta_x = np.where(wider, 0.1 * width, 0.1 * height + (height - width) / 2)
    delta_y = np.where(wider, 0.1 * width + (width - height) / 2, 0.1 * height)

    # set the starting and ending point of the normalization bounding box
    starting_x, starting_y = min_x - delta_x, min_y - delta_y
    ending_x, ending_y = max_x + delta_x, max_y + delta_y

    # center and dimensions of the bounding box with shape (frames, 1, 2)
    # so that they broadcast against hand_coords without tiling
    center_coords = np.stack(((starting_x + ending_x) / 2, (starting_y + ending_y) / 2), axis=1)[:, np.newaxis, :]
    bbox_dims = np.stack((ending_x - starting_x, ending_y - starting_y), axis=1)[:, np.newaxis, :]

    # only frames with a non-degenerate bounding box can be normalized
    valid = np.all(bbox_dims != 0, axis=(1, 2))
    if not valid.any():
        return dataframe

    # normalize the hand keypoints based on the bounding box around the hand
    norm_hand_coords = hand_coords.copy()
    norm_hand_coords[valid] = (hand_coords[valid] - center_coords[valid]) / bbox_dims[valid]

    # flatten the normalized hand keypoints and write them back into the dataframe
    dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42)

    return dataframe
|
||||||
67
src/train.py
67
src/train.py
@@ -13,10 +13,12 @@ import torch.optim as optim
|
|||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
from torchvision import transforms
|
from torchvision import transforms
|
||||||
|
|
||||||
from datasets.wlasl_dataset import WLASLDataset
|
from src.augmentations import MirrorKeypoints
|
||||||
from identifiers import LANDMARKS
|
from src.datasets.finger_spelling_dataset import FingerSpellingDataset
|
||||||
from keypoint_extractor import KeypointExtractor
|
from src.datasets.wlasl_dataset import WLASLDataset
|
||||||
from model import SPOTER
|
from src.identifiers import LANDMARKS
|
||||||
|
from src.keypoint_extractor import KeypointExtractor
|
||||||
|
from src.model import SPOTER
|
||||||
|
|
||||||
|
|
||||||
def train():
|
def train():
|
||||||
@@ -32,30 +34,28 @@ def train():
|
|||||||
|
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
|
||||||
spoter_model = SPOTER(num_classes=100, hidden_dim=len(LANDMARKS) *2)
|
spoter_model = SPOTER(num_classes=12, hidden_dim=len(LANDMARKS) *2)
|
||||||
spoter_model.train(True)
|
spoter_model.train(True)
|
||||||
spoter_model.to(device)
|
spoter_model.to(device)
|
||||||
|
|
||||||
criterion = nn.CrossEntropyLoss()
|
criterion = nn.CrossEntropyLoss()
|
||||||
optimizer = optim.SGD(spoter_model.parameters(), lr=0.001, momentum=0.9)
|
optimizer = optim.SGD(spoter_model.parameters(), lr=0.0001, momentum=0.9)
|
||||||
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)
|
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)
|
||||||
|
|
||||||
# TODO: create paths for checkpoints
|
# TODO: create paths for checkpoints
|
||||||
|
|
||||||
# TODO: transformations + augmentations
|
# TODO: transformations + augmentations
|
||||||
|
|
||||||
k = KeypointExtractor("data/videos/")
|
k = KeypointExtractor("data/fingerspelling/data/")
|
||||||
|
|
||||||
train_set = WLASLDataset("data/nslt_100.json", "data/missing.txt", k, keypoints_identifier=LANDMARKS, subset="train")
|
transform = transforms.Compose([MirrorKeypoints()])
|
||||||
|
|
||||||
|
train_set = FingerSpellingDataset("data/fingerspelling/data/", k, keypoints_identifier=LANDMARKS, subset="train", transform=transform)
|
||||||
train_loader = DataLoader(train_set, shuffle=True, generator=g)
|
train_loader = DataLoader(train_set, shuffle=True, generator=g)
|
||||||
|
|
||||||
val_set = WLASLDataset("data/nslt_100.json", "data/missing.txt", k, keypoints_identifier=LANDMARKS, subset="val")
|
val_set = FingerSpellingDataset("data/fingerspelling/data/", k, keypoints_identifier=LANDMARKS, subset="val")
|
||||||
val_loader = DataLoader(val_set, shuffle=True, generator=g)
|
val_loader = DataLoader(val_set, shuffle=True, generator=g)
|
||||||
|
|
||||||
test_set = WLASLDataset("data/nslt_100.json", "data/missing.txt", k, keypoints_identifier=LANDMARKS, subset="test")
|
|
||||||
test_loader = DataLoader(test_set, shuffle=True, generator=g)
|
|
||||||
|
|
||||||
|
|
||||||
train_acc, val_acc = 0, 0
|
train_acc, val_acc = 0, 0
|
||||||
lr_progress = []
|
lr_progress = []
|
||||||
top_train_acc, top_val_acc = 0, 0
|
top_train_acc, top_val_acc = 0, 0
|
||||||
@@ -81,32 +81,39 @@ def train():
|
|||||||
if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]):
|
if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]):
|
||||||
pred_correct += 1
|
pred_correct += 1
|
||||||
pred_all += 1
|
pred_all += 1
|
||||||
|
|
||||||
if i % 100 == 0:
|
|
||||||
print(f"Epoch: {epoch} | Batch: {i} | Loss: {running_loss.item()} | Train Acc: {(pred_correct / pred_all)}")
|
|
||||||
|
|
||||||
if scheduler:
|
if scheduler:
|
||||||
scheduler.step(running_loss.item() / len(train_loader))
|
scheduler.step(running_loss.item() / len(train_loader))
|
||||||
|
|
||||||
# validate
|
# validate and print val acc
|
||||||
|
val_pred_correct, val_pred_all = 0, 0
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
for i, (inputs, labels) in enumerate(val_loader):
|
for i, (inputs, labels) in enumerate(val_loader):
|
||||||
inputs = inputs.squeeze(0).to(device)
|
inputs = inputs.squeeze(0).to(device)
|
||||||
labels = labels.to(device)
|
labels = labels.to(device, dtype=torch.long)
|
||||||
|
|
||||||
outputs = spoter_model(inputs)
|
outputs = spoter_model(inputs).expand(1, -1, -1)
|
||||||
_, predicted = torch.max(outputs.data, 1)
|
|
||||||
val_acc = (predicted == labels).sum().item() / labels.size(0)
|
if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]):
|
||||||
|
val_pred_correct += 1
|
||||||
|
val_pred_all += 1
|
||||||
|
|
||||||
|
val_acc = (val_pred_correct / val_pred_all)
|
||||||
|
|
||||||
|
print(f"Epoch: {epoch} | Train Acc: {(pred_correct / pred_all)} | Val Acc: {val_acc}")
|
||||||
|
|
||||||
|
|
||||||
# save checkpoint
|
# save checkpoint
|
||||||
# if val_acc > top_val_acc:
|
if val_acc > top_val_acc and epoch > 55:
|
||||||
# top_val_acc = val_acc
|
top_val_acc = val_acc
|
||||||
# top_train_acc = train_acc
|
top_train_acc = train_acc
|
||||||
# checkpoint_index = epoch
|
checkpoint_index = epoch
|
||||||
# torch.save(spoter_model.state_dict(), f"checkpoints/spoter_{epoch}.pth")
|
torch.save(spoter_model.state_dict(), f"checkpoints/spoter_{epoch}.pth")
|
||||||
|
|
||||||
print(f"Epoch: {epoch} | Train Acc: {train_acc} | Val Acc: {val_acc}")
|
|
||||||
lr_progress.append(optimizer.param_groups[0]['lr'])
|
lr_progress.append(optimizer.param_groups[0]['lr'])
|
||||||
|
|
||||||
|
print(f"Best val acc: {top_val_acc} | Best train acc: {top_train_acc} | Epoch: {checkpoint_index}")
|
||||||
|
|
||||||
train()
|
# Path: src/train.py
|
||||||
|
if __name__ == "__main__":
|
||||||
|
train()
|
||||||
232
visualize_data.ipynb
Normal file
232
visualize_data.ipynb
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from src.keypoint_extractor import KeypointExtractor\n",
|
||||||
|
"\n",
|
||||||
|
"# reload modules\n",
|
||||||
|
"%load_ext autoreload"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"video_name = '69547.mp4' "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# extract keypoints\n",
|
||||||
|
"keypoint_extractor = KeypointExtractor('data/videos/')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from IPython.display import HTML\n",
|
||||||
|
"from base64 import b64encode\n",
|
||||||
|
"import mediapy as media\n",
|
||||||
|
"%matplotlib inline\n",
|
||||||
|
"\n",
|
||||||
|
"# Define the frames per second (fps) and duration of the video\n",
|
||||||
|
"fps = 25\n",
|
||||||
|
"duration = 10\n",
|
||||||
|
"\n",
|
||||||
|
"# Extract the keypoints and the annotated frames of the video\n",
|
||||||
|
"_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\", draw=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Convert the video to a numpy array\n",
|
||||||
|
"video = np.array(video_frames)\n",
|
||||||
|
"media.show_video(video, height=400, codec='gif', fps=4)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from src.model import SPOTER\n",
|
||||||
|
"from src.identifiers import LANDMARKS\n",
|
||||||
|
"import torch\n",
|
||||||
|
"\n",
|
||||||
|
"spoter_model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) *2)\n",
|
||||||
|
"spoter_model.load_state_dict(torch.load('models/spoter_40.pth'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# get average number of frames in test set\n",
|
||||||
|
"from src.keypoint_extractor import KeypointExtractor\n",
|
||||||
|
"from src.datasets.finger_spelling_dataset import FingerSpellingDataset\n",
|
||||||
|
"from src.identifiers import LANDMARKS\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"keypoints_extractor = KeypointExtractor(\"data/fingerspelling/data/\")\n",
|
||||||
|
"test_set = FingerSpellingDataset(\"data/fingerspelling/data/\", keypoints_extractor, keypoints_identifier=LANDMARKS, subset=\"val\")\n",
|
||||||
|
"\n",
|
||||||
|
"frames = []\n",
|
||||||
|
"labels = []\n",
|
||||||
|
"for sample, label in test_set:\n",
|
||||||
|
" frames.append(sample.shape[0])\n",
|
||||||
|
" labels.append(label)\n",
|
||||||
|
"\n",
|
||||||
|
"print(np.mean(frames))\n",
|
||||||
|
"# get label frequency in the labels list\n",
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"\n",
|
||||||
|
"counter = Counter(labels)\n",
|
||||||
|
"print(counter)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Hand keypoint visualization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"\n",
|
||||||
|
"def plot_hand_keypoints(dataframe, hand, frame):\n",
|
||||||
|
" hand_columns = np.array([i for i in range(66 + (42 if hand == \"right\" else 0), 108 + (42 if hand == \"right\" else 0))])\n",
|
||||||
|
" \n",
|
||||||
|
" # get the x, y coordinates of the hand keypoints\n",
|
||||||
|
" frame_df = dataframe.iloc[frame:frame+1, hand_columns]\n",
|
||||||
|
" hand_coords = frame_df.values.reshape(21, 2)\n",
|
||||||
|
" \n",
|
||||||
|
" x_coords = hand_coords[:, ::2] #Even indices\n",
|
||||||
|
" y_coords = hand_coords[:, 1::2] #Uneven indices\n",
|
||||||
|
" \n",
|
||||||
|
" #Plot the keypoints\n",
|
||||||
|
" plt.scatter(x_coords, y_coords)\n",
|
||||||
|
" return frame_df.style"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#Set video, hand and frame to display\n",
|
||||||
|
"video_name = '69547.mp4'\n",
|
||||||
|
"hand = \"right\"\n",
|
||||||
|
"frame = 3\n",
|
||||||
|
"%reload_ext autoreload"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from src.keypoint_extractor import KeypointExtractor\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"#Extract keypoints from requested video\n",
|
||||||
|
"keypoints_extractor = KeypointExtractor(\"data/videos/\")\n",
|
||||||
|
"\n",
|
||||||
|
"#Plot the hand keypoints\n",
|
||||||
|
"df = keypoints_extractor.extract_keypoints_from_video(video_name)\n",
|
||||||
|
"df.head()\n",
|
||||||
|
"plot_hand_keypoints(df, hand, frame)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#Plot the NORMALIZED hand keypoints (using minmax)\n",
|
||||||
|
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\")\n",
|
||||||
|
"plot_hand_keypoints(df, hand, frame)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#Plot the NORMALIZED hand keypoints (using bohacek)\n",
|
||||||
|
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"bohacek\")\n",
|
||||||
|
"plot_hand_keypoints(df, hand, frame)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.16"
|
||||||
|
},
|
||||||
|
"vscode": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
167
webcam_view.py
Normal file
167
webcam_view.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
import cv2
|
||||||
|
import mediapipe as mp
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from src.identifiers import LANDMARKS
|
||||||
|
from src.model import SPOTER
|
||||||
|
|
||||||
|
# Live webcam demo: MediaPipe Holistic keypoints -> SPOTER classifier -> overlay.

# Checkpoint loaded into the classifier and the number of classes it was built with.
MODEL_PATH = 'models/spoter_57.pth'
NUM_CLASSES = 12

# Number of consecutive frames handed to the classifier at once.
WINDOW_SIZE = 8

# Number of highest-scoring classes drawn on the frame.
TOP_K = 3

# Layout of one flattened frame vector: the pose values come first, then the
# left hand, then the right hand. Each landmark contributes an (x, y) pair.
POSE_VALUES = 66
HAND_LANDMARKS = 21
HAND_VALUES = HAND_LANDMARKS * 2

# Class index -> letter shown in the overlay (0 -> "A" ... 11 -> "L").
CLASS_LABELS = dict(enumerate("ABCDEFGHIJKL"))


def extract_keypoints(landmarks):
    """Flatten a landmark list into ``[x0, y0, x1, y1, ...]``.

    Args:
        landmarks: Object exposing a ``landmark`` iterable whose items have
            ``x`` and ``y`` attributes, or a falsy value when nothing was
            detected.

    Returns:
        A flat list of coordinates, or ``None`` when ``landmarks`` is falsy.
    """
    if not landmarks:
        return None
    return [value for point in landmarks.landmark for value in (point.x, point.y)]


def landmark_columns(landmark_indices):
    """Return the x and y column index of every landmark index, in order.

    Landmark ``i`` occupies columns ``2 * i`` (x) and ``2 * i + 1`` (y) of the
    flattened frame vector.
    """
    return [column for index in landmark_indices for column in (index * 2, index * 2 + 1)]


def hand_bounding_box(points, algorithm):
    """Compute the box used to normalize one hand.

    Args:
        points: Array of shape ``(n, 2)`` holding x/y coordinates.
        algorithm: ``"minmax"`` for the tight box around the points, or
            ``"bohacek"`` for a square box padded by 10% of the longer side.

    Returns:
        ``(start_x, start_y, end_x, end_y)``, or ``None`` when ``algorithm``
        is not one of the supported names.
    """
    min_x, min_y = points.min(axis=0)
    max_x, max_y = points.max(axis=0)

    if algorithm == "minmax":
        return min_x, min_y, max_x, max_y

    if algorithm == "bohacek":
        width, height = max_x - min_x, max_y - min_y
        # Pad the longer side by 10% and widen the shorter side until the
        # box is square.
        if width > height:
            delta_x = 0.1 * width
            delta_y = delta_x + ((width - height) / 2)
        else:
            delta_y = 0.1 * height
            delta_x = delta_y + ((height - width) / 2)
        return min_x - delta_x, min_y - delta_y, max_x + delta_x, max_y + delta_y

    return None


def normalize_hand(frame, data, hand, algorithm="minmax"):
    """Normalize one hand's keypoints relative to its bounding box.

    The hand's columns of ``data`` are overwritten in place with coordinates
    centred on the box and divided by its width and height, and the box is
    drawn on ``frame``.

    Args:
        frame: Image array; only its height and width (``shape[0]``,
            ``shape[1]``) are used to convert coordinates to pixels.
        data: Array of shape ``(1, n)`` holding the flattened keypoints of a
            single frame.
        hand: ``"right_hand"`` selects the right-hand columns; any other
            value selects the left-hand columns.
        algorithm: ``"minmax"`` or ``"bohacek"`` (see ``hand_bounding_box``).

    Returns:
        The ``(data, frame)`` pair. Both are returned unchanged when the
        algorithm is unknown or the box has zero width or height (for
        example when the hand is missing and its columns are all zero).
    """
    first_column = POSE_VALUES + (HAND_VALUES if hand == "right_hand" else 0)
    columns = np.arange(first_column, first_column + HAND_VALUES)

    # Fancy indexing copies, so the pixel conversion below leaves `data` alone.
    points = np.asarray(data[0])[columns].reshape(HAND_LANDMARKS, 2)

    # Convert to absolute pixels.
    frame_height, frame_width = frame.shape[0], frame.shape[1]
    points = points * (frame_width, frame_height)

    box = hand_bounding_box(points, algorithm)
    if box is None:
        print("Not a valid normalization algorithm")
        return data, frame

    starting_x, starting_y, ending_x, ending_y = box
    bbox_width, bbox_height = ending_x - starting_x, ending_y - starting_y

    # A degenerate box would divide by zero; leave the hand untouched.
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    center = np.array([(starting_x + ending_x) / 2, (starting_y + ending_y) / 2])
    points = (points - center) / (bbox_width, bbox_height)

    # Add the bounding box to the frame.
    frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)), (int(ending_x), int(ending_y)), (0, 255, 0), 2)

    data[:, columns] = points.reshape(-1, HAND_VALUES)
    return data, frame


def classify_window(model, window):
    """Run ``model`` on a window of frames and return its top predictions.

    Args:
        model: Callable taking a float tensor of stacked frames.
            NOTE(review): the indexing below assumes the output can be viewed
            as ``(1, 1, num_classes)`` - confirm against ``SPOTER.forward``.
        window: Sequence of equally sized 1-D arrays, one per frame.

    Returns:
        The ``torch.topk`` result (``values`` and ``indices``) over the
        softmax of the model output, with ``k = TOP_K``.
    """
    # Stack into one array first: building a tensor straight from a list of
    # ndarrays is very slow and emits a warning.
    batch = torch.from_numpy(np.stack(window)).float()

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(batch).expand(1, -1, -1)

    probabilities = torch.nn.functional.softmax(outputs, dim=2)
    return torch.topk(probabilities, k=TOP_K, dim=2)


def draw_predictions(frame, top_predictions):
    """Write each predicted label and its score at the top right of ``frame``."""
    labels, scores = top_predictions.indices[0][0], top_predictions.values[0][0]
    for row, (label, score) in enumerate(zip(labels, scores)):
        cv2.putText(frame, f"{CLASS_LABELS[label.item()]} {score.item():.2f}", (frame.shape[1] - 200, 50 + row * 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)


def main():
    """Classify webcam frames until the stream ends or Esc is pressed."""
    mp_holistic = mp.solutions.holistic
    mp_drawing = mp.solutions.drawing_utils

    spoter_model = SPOTER(num_classes=NUM_CLASSES, hidden_dim=len(LANDMARKS) * 2)
    # The model is never moved off the CPU, so load the weights there too.
    spoter_model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))
    # Switch off training-only behaviour such as dropout before predicting.
    spoter_model.eval()

    # Columns of the flattened frame vector that the classifier consumes.
    feature_columns = np.array(landmark_columns(LANDMARKS.values()))

    norm_alg = "minmax"
    keypoints = []

    # Initialize video capture object.
    cap = cv2.VideoCapture(0)
    try:
        # Initialize the MediaPipe Holistic model; the context manager closes
        # it again on exit.
        with mp_holistic.Holistic(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
            model_complexity=2,
        ) as holistic:
            while True:
                # Read a frame from the webcam.
                ret, frame = cap.read()
                if not ret:
                    break

                # Convert the frame to RGB before handing it to MediaPipe.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Detect pose and hand landmarks in the frame.
                results = holistic.process(frame)

                pose = extract_keypoints(results.pose_landmarks)
                left_hand = extract_keypoints(results.left_hand_landmarks)
                right_hand = extract_keypoints(results.right_hand_landmarks)

                # Only classify when the pose and at least one hand are visible.
                if pose and (left_hand or right_hand):
                    # A missing hand is represented by zeros.
                    data = np.array([pose + (left_hand or [0] * HAND_VALUES) + (right_hand or [0] * HAND_VALUES)])

                    data, frame = normalize_hand(frame, data, "left_hand", norm_alg)
                    data, frame = normalize_hand(frame, data, "right_hand", norm_alg)

                    # Keep only the most recent WINDOW_SIZE frames.
                    keypoints.append(data[0][feature_columns])
                    del keypoints[:-WINDOW_SIZE]

                    if len(keypoints) == WINDOW_SIZE:
                        draw_predictions(frame, classify_window(spoter_model, keypoints))

                mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
                mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
                mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

                # Convert the frame back to BGR, the channel order OpenCV displays.
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

                # Show the frame.
                cv2.imshow('MediaPipe Hands', frame)

                # Exit when Esc (key code 27) is pressed.
                if cv2.waitKey(5) & 0xFF == 27:
                    break
    finally:
        # Release the video capture object and destroy the windows, even if
        # the loop above raised.
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user