Initial codebase (#1)

* Add project code

* Logger improvements

* Improvements to web demo code

* added create_wlasl_landmarks_dataset.py and extract_mediapipe_landmarks.py

* Fix rotation augmentation

* fixed error in docstring, and removed unnecessary replace -1 -> 0

* Readme updates

* Share base notebooks

* Add notebooks and unify for different datasets

* requirements update

* fixes

* Make evaluate more deterministic

* Allow training with clearml

* refactor preprocessing and apply linter

* Minor fixes

* Minor notebook tweaks

* Readme updates

* Fix PR comments

* Remove unneeded code

* Add banner to Readme

---------

Co-authored-by: Gabriel Lema <gabriel.lema@xmartlabs.com>
This commit is contained in:
Mathias Claassen
2023-03-03 10:07:54 -03:00
committed by GitHub
parent 661e4bbc03
commit 81bbf66aab
49 changed files with 4205 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
# Translation tables from BlazePose/MediaPipe landmark names (snake_case) to the
# camelCase joint names used by the normalization pipeline. Keys/values must stay
# exactly as the upstream column names and the downstream identifiers expect.
_BODY_KEYPOINT_MAPPING = {
    "nose": "nose",
    "left_eye": "leftEye",
    "right_eye": "rightEye",
    "left_ear": "leftEar",
    "right_ear": "rightEar",
    "left_shoulder": "leftShoulder",
    "right_shoulder": "rightShoulder",
    "left_elbow": "leftElbow",
    "right_elbow": "rightElbow",
    "left_wrist": "leftWrist",
    "right_wrist": "rightWrist"
}
_HAND_KEYPOINT_MAPPING = {
    "wrist": "wrist",
    "index_finger_tip": "indexTip",
    "index_finger_dip": "indexDIP",
    "index_finger_pip": "indexPIP",
    "index_finger_mcp": "indexMCP",
    "middle_finger_tip": "middleTip",
    "middle_finger_dip": "middleDIP",
    "middle_finger_pip": "middlePIP",
    "middle_finger_mcp": "middleMCP",
    "ring_finger_tip": "ringTip",
    "ring_finger_dip": "ringDIP",
    "ring_finger_pip": "ringPIP",
    "ring_finger_mcp": "ringMCP",
    "pinky_tip": "littleTip",
    "pinky_dip": "littleDIP",
    "pinky_pip": "littlePIP",
    "pinky_mcp": "littleMCP",
    "thumb_tip": "thumbTip",
    "thumb_ip": "thumbIP",
    "thumb_mcp": "thumbMP",
    "thumb_cmc": "thumbCMC"
}


def map_blazepose_keypoint(column):
    """Map one BlazePose column name (ending in ``_x``/``_y``) to its camelCase
    equivalent, or return None when the landmark is not used by the pipeline.

    Body columns become e.g. ``left_eye_x`` -> ``leftEye_X``; hand columns become
    e.g. ``left_hand_thumb_tip_y`` -> ``thumbTip_left_Y``.
    """
    # Split the trailing axis suffix ("_x"/"_y") from the landmark name.
    axis = column[-2:].upper()
    base = column[:-2]
    for prefix, side in (("left_hand_", "left"), ("right_hand_", "right")):
        if base.startswith(prefix):
            joint = _HAND_KEYPOINT_MAPPING.get(base[len(prefix):])
            return None if joint is None else f"{joint}_{side}{axis}"
    body_joint = _BODY_KEYPOINT_MAPPING.get(base)
    return None if body_joint is None else body_joint + axis
def map_blazepose_df(df):
    """Rename a BlazePose landmark DataFrame to the pipeline's column schema.

    Columns that map via ``map_blazepose_keypoint`` are renamed; unmapped ones
    are dropped. A synthetic neck keypoint (shoulder midpoint, per frame) is
    added as stringified ``neck_X``/``neck_Y`` columns.
    """
    renamings = {}
    unmapped = []
    for original_name in df.columns:
        target_name = map_blazepose_keypoint(original_name)
        if target_name:
            renamings[original_name] = target_name
        else:
            unmapped.append(original_name)
    df = df.rename(columns=renamings)

    for index, row in df.iterrows():
        frame_count = len(row["leftEar_Y"])
        lsx, rsx = row["leftShoulder_X"], row["rightShoulder_X"]
        lsy, rsy = row["leftShoulder_Y"], row["rightShoulder_Y"]
        # Per frame, the neck is the midpoint between the two shoulders.
        neck_x = [(float(lsx[i]) + float(rsx[i])) / 2 for i in range(frame_count)]
        neck_y = [(float(lsy[i]) + float(rsy[i])) / 2 for i in range(frame_count)]
        # Stored as strings so the cell holds a single scalar value per row.
        df.loc[index, "neck_X"] = str(neck_x)
        df.loc[index, "neck_Y"] = str(neck_y)

    # The unmapped columns kept their original names through the rename above.
    df.drop(columns=unmapped, inplace=True)
    return df

View File

@@ -0,0 +1,241 @@
import copy
from typing import Tuple

import pandas as pd

from utils import get_logger
# Body joints (camelCase naming produced by the keypoint mapping step) that the
# normalization routines below operate on; each identifier corresponds to
# "<identifier>_X"/"<identifier>_Y" DataFrame columns or "<identifier>" dict keys.
BODY_IDENTIFIERS = [
    "nose",
    "neck",
    "rightEye",
    "leftEye",
    "rightEar",
    "leftEar",
    "rightShoulder",
    "leftShoulder",
    "rightElbow",
    "leftElbow",
    "rightWrist",
    "leftWrist"
]
def normalize_body_full(df: pd.DataFrame) -> Tuple[pd.DataFrame, list]:
    """
    Normalizes the body position data using the Bohacek-normalization algorithm.

    Each row holds per-joint lists of per-frame coordinates. Frames are scaled
    into a bounding box derived from a "head metric" (shoulder distance, or
    neck-nose distance as fallback). Sequences that cannot be normalized are
    kept with their original values and reported via the returned index list.

    :param df: pd.DataFrame to be normalized
    :return: tuple of (pd.DataFrame with normalized values for body pose,
             list of row indexes that could not be normalized)
    """
    logger = get_logger(__name__)
    # TODO: Fix division by zero
    normalized_rows = []
    invalid_row_indexes = []

    # Iterate over all of the records in the dataset
    for index, row in df.iterrows():
        sequence_size = len(row["leftEar_Y"])
        valid_sequence = True
        # Deep copy: the loop below mutates the per-frame lists in place, so a
        # plain alias would not preserve the original data for the fallback path.
        original_row = copy.deepcopy(row)

        last_starting_point, last_ending_point = None, None

        # Treat each element of the sequence (analyzed frame) individually
        for sequence_index in range(sequence_size):
            # Prevent from even starting the analysis if some necessary elements are not present
            if (row["leftShoulder_X"][sequence_index] == 0 or row["rightShoulder_X"][sequence_index] == 0) and \
                    (row["neck_X"][sequence_index] == 0 or row["nose_X"][sequence_index] == 0):
                if not last_starting_point:
                    valid_sequence = False
                    continue
                else:
                    starting_point, ending_point = last_starting_point, last_ending_point
            else:
                # NOTE:
                #
                # While in the paper, it is written that the head metric is calculated by halving the shoulder
                # distance, this is meant for the distance between the very ends of one's shoulder, as literature
                # studying body metrics and ratios generally states. The Vision Pose Estimation API, however, seems
                # to be predicting rather the center of one's shoulder. Based on our experiments and manual reviews
                # of the data, employing this as just the plain shoulder distance seems to be more corresponding to
                # the desired metric.
                #
                # Please, review this if using other third-party pose estimation libraries.
                if row["leftShoulder_X"][sequence_index] != 0 and row["rightShoulder_X"][sequence_index] != 0:
                    left_shoulder = (row["leftShoulder_X"][sequence_index], row["leftShoulder_Y"][sequence_index])
                    right_shoulder = (row["rightShoulder_X"][sequence_index], row["rightShoulder_Y"][sequence_index])
                    shoulder_distance = ((((left_shoulder[0] - right_shoulder[0]) ** 2) + (
                        (left_shoulder[1] - right_shoulder[1]) ** 2)) ** 0.5)
                    head_metric = shoulder_distance
                else:
                    neck = (row["neck_X"][sequence_index], row["neck_Y"][sequence_index])
                    nose = (row["nose_X"][sequence_index], row["nose_Y"][sequence_index])
                    neck_nose_distance = ((((neck[0] - nose[0]) ** 2) + ((neck[1] - nose[1]) ** 2)) ** 0.5)
                    head_metric = neck_nose_distance

                # Set the starting and ending point of the normalization bounding box
                starting_point = [row["neck_X"][sequence_index] - 3 * head_metric,
                                  row["leftEye_Y"][sequence_index] + (head_metric / 2)]
                ending_point = [row["neck_X"][sequence_index] + 3 * head_metric, starting_point[1] - 6 * head_metric]

                last_starting_point, last_ending_point = starting_point, ending_point

            # Ensure that all of the bounding-box-defining coordinates are not out of the picture
            if starting_point[0] < 0:
                starting_point[0] = 0
            if starting_point[1] < 0:
                starting_point[1] = 0
            if ending_point[0] < 0:
                ending_point[0] = 0
            if ending_point[1] < 0:
                ending_point[1] = 0

            # Normalize individual landmarks and save the results
            for identifier in BODY_IDENTIFIERS:
                key = identifier + "_"

                # Prevent from trying to normalize incorrectly captured points
                if row[key + "X"][sequence_index] == 0:
                    continue

                normalized_x = (row[key + "X"][sequence_index] - starting_point[0]) / (ending_point[0] -
                                                                                       starting_point[0])
                normalized_y = (row[key + "Y"][sequence_index] - ending_point[1]) / (starting_point[1] -
                                                                                     ending_point[1])

                row[key + "X"][sequence_index] = normalized_x
                row[key + "Y"][sequence_index] = normalized_y

        if valid_sequence:
            normalized_rows.append(row)
        else:
            logger.warning(" BODY LANDMARKS: One video instance could not be normalized.")
            normalized_rows.append(original_row)
            invalid_row_indexes.append(index)

    # DataFrame.append was removed in pandas 2.0; assemble the result in one go.
    if normalized_rows:
        normalized_df = pd.DataFrame(normalized_rows).reset_index(drop=True)
    else:
        normalized_df = pd.DataFrame(columns=df.columns)

    # Lazy %-style args: logging formats them only when the record is emitted,
    # and passing extra positional args without placeholders is a formatting error.
    logger.info("The normalization of body is finished.")
    logger.info("\t-> Original size: %s", df.shape[0])
    logger.info("\t-> Normalized size: %s", normalized_df.shape[0])
    logger.info("\t-> Problematic videos: %s", len(invalid_row_indexes))

    return normalized_df, invalid_row_indexes
def normalize_single_dict(row: dict):
    """
    Normalizes the skeletal data for a given sequence of frames with signer's body pose data. The normalization
    follows the definition from our paper.

    :param row: Dictionary containing key-value pairs with joint identifiers and corresponding lists (sequences) of
                that particular joints coordinates
    :return: Dictionary with normalized skeletal data (following the same schema as input data); when the sequence
             cannot be normalized, the original (unmodified) data is returned instead
    """
    sequence_size = len(row["leftEar"])
    valid_sequence = True
    # Deep copy: the loop below replaces per-frame coordinate entries inside the
    # shared lists, so a plain alias would return already-mutated data on the
    # fallback path instead of the original values.
    original_row = copy.deepcopy(row)
    logger = get_logger(__name__)

    last_starting_point, last_ending_point = None, None

    # Treat each element of the sequence (analyzed frame) individually
    for sequence_index in range(sequence_size):
        left_shoulder = (row["leftShoulder"][sequence_index][0], row["leftShoulder"][sequence_index][1])
        right_shoulder = (row["rightShoulder"][sequence_index][0], row["rightShoulder"][sequence_index][1])
        neck = (row["neck"][sequence_index][0], row["neck"][sequence_index][1])
        nose = (row["nose"][sequence_index][0], row["nose"][sequence_index][1])

        # Prevent from even starting the analysis if some necessary elements are not present
        if (left_shoulder[0] == 0 or right_shoulder[0] == 0
                or (left_shoulder[0] == right_shoulder[0] and left_shoulder[1] == right_shoulder[1])) and (
                neck[0] == 0 or nose[0] == 0 or (neck[0] == nose[0] and neck[1] == nose[1])):
            if not last_starting_point:
                valid_sequence = False
                continue
            else:
                starting_point, ending_point = last_starting_point, last_ending_point
        else:
            # NOTE:
            #
            # While in the paper, it is written that the head metric is calculated by halving the shoulder distance,
            # this is meant for the distance between the very ends of one's shoulder, as literature studying body
            # metrics and ratios generally states. The Vision Pose Estimation API, however, seems to be predicting
            # rather the center of one's shoulder. Based on our experiments and manual reviews of the data, employing
            # this as just the plain shoulder distance seems to be more corresponding to the desired metric.
            #
            # Please, review this if using other third-party pose estimation libraries.
            if left_shoulder[0] != 0 and right_shoulder[0] != 0 and \
                    (left_shoulder[0] != right_shoulder[0] or left_shoulder[1] != right_shoulder[1]):
                shoulder_distance = ((((left_shoulder[0] - right_shoulder[0]) ** 2) + (
                    (left_shoulder[1] - right_shoulder[1]) ** 2)) ** 0.5)
                head_metric = shoulder_distance
            else:
                neck_nose_distance = ((((neck[0] - nose[0]) ** 2) + ((neck[1] - nose[1]) ** 2)) ** 0.5)
                head_metric = neck_nose_distance

            # Set the starting and ending point of the normalization bounding box.
            # (The half-head-metric offset used by normalize_body_full was replaced
            # here with a full head metric.)
            # starting_point = [row["neck"][sequence_index][0] - 3 * head_metric,
            #                   row["leftEye"][sequence_index][1] + (head_metric / 2)]
            starting_point = [row["neck"][sequence_index][0] - 3 * head_metric,
                              row["leftEye"][sequence_index][1] + head_metric]
            ending_point = [row["neck"][sequence_index][0] + 3 * head_metric, starting_point[1] - 6 * head_metric]

            last_starting_point, last_ending_point = starting_point, ending_point

        # Ensure that all of the bounding-box-defining coordinates are not out of the picture
        if starting_point[0] < 0:
            starting_point[0] = 0
        if starting_point[1] < 0:
            starting_point[1] = 0
        if ending_point[0] < 0:
            ending_point[0] = 0
        if ending_point[1] < 0:
            ending_point[1] = 0

        # Normalize individual landmarks and save the results
        for identifier in BODY_IDENTIFIERS:
            key = identifier

            # Prevent from trying to normalize incorrectly captured points
            if row[key][sequence_index][0] == 0:
                continue

            if (ending_point[0] - starting_point[0]) == 0 or (starting_point[1] - ending_point[1]) == 0:
                logger.warning("Problematic normalization")
                valid_sequence = False
                break

            normalized_x = (row[key][sequence_index][0] - starting_point[0]) / (ending_point[0] - starting_point[0])
            normalized_y = (row[key][sequence_index][1] - ending_point[1]) / (starting_point[1] - ending_point[1])

            row[key][sequence_index] = list(row[key][sequence_index])
            row[key][sequence_index][0] = normalized_x
            row[key][sequence_index][1] = normalized_y

    if valid_sequence:
        return row
    else:
        return original_row


if __name__ == "__main__":
    pass

View File

@@ -0,0 +1,195 @@
import pandas as pd
from utils import get_logger
# Hand joints (camelCase naming produced by the keypoint mapping step) that the
# normalization routines below operate on; each identifier corresponds to
# "<identifier>_<hand>_X"/"_Y" DataFrame columns or "<identifier>_<hand>" dict keys,
# where <hand> is 0 (first hand) or 1 (second hand).
HAND_IDENTIFIERS = [
    "wrist",
    "indexTip",
    "indexDIP",
    "indexPIP",
    "indexMCP",
    "middleTip",
    "middleDIP",
    "middlePIP",
    "middleMCP",
    "ringTip",
    "ringDIP",
    "ringPIP",
    "ringMCP",
    "littleTip",
    "littleDIP",
    "littlePIP",
    "littleMCP",
    "thumbTip",
    "thumbIP",
    "thumbMP",
    "thumbCMC"
]
def normalize_hands_full(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes the hands position data using the Bohacek-normalization algorithm.

    Per hand and per frame, the landmarks are rescaled into a square bounding
    box padded by 10% around the detected hand extent. Note: the input ``df``'s
    columns are renamed in place ("_left_"/"_right_" -> "_0_"/"_1_").

    :param df: pd.DataFrame to be normalized
    :return: pd.DataFrame with normalized values for hand pose
    """
    logger = get_logger(__name__)
    # TODO: Fix division by zero
    df.columns = [item.replace("_left_", "_0_").replace("_right_", "_1_") for item in list(df.columns)]
    hand_landmarks = {"X": {0: [], 1: []}, "Y": {0: [], 1: []}}

    # Determine how many hands are present in the dataset
    range_hand_size = 1
    if "wrist_1_X" in df.columns:
        range_hand_size = 2

    # Construct the relevant identifiers
    for identifier in HAND_IDENTIFIERS:
        for hand_index in range(range_hand_size):
            hand_landmarks["X"][hand_index].append(identifier + "_" + str(hand_index) + "_X")
            hand_landmarks["Y"][hand_index].append(identifier + "_" + str(hand_index) + "_Y")

    normalized_rows = []

    # Iterate over all of the records in the dataset
    for index, row in df.iterrows():
        # Treat each hand individually
        for hand_index in range(range_hand_size):
            sequence_size = len(row["wrist_" + str(hand_index) + "_X"])

            # Treat each element of the sequence (analyzed frame) individually
            for sequence_index in range(sequence_size):
                # Retrieve all of the X and Y values of the current frame
                landmarks_x_values = [row[key][sequence_index]
                                      for key in hand_landmarks["X"][hand_index] if row[key][sequence_index] != 0]
                landmarks_y_values = [row[key][sequence_index]
                                      for key in hand_landmarks["Y"][hand_index] if row[key][sequence_index] != 0]

                # Prevent from even starting the analysis if some necessary elements are not present
                if not landmarks_x_values or not landmarks_y_values:
                    logger.warning(
                        " HAND LANDMARKS: One frame could not be normalized as there is no data present. Record: " +
                        str(index) +
                        ", Frame: " + str(sequence_index))
                    continue

                # Calculate the deltas (padding that makes the bounding box square)
                width, height = max(landmarks_x_values) - min(landmarks_x_values), max(landmarks_y_values) - min(
                    landmarks_y_values)
                if width > height:
                    delta_x = 0.1 * width
                    delta_y = delta_x + ((width - height) / 2)
                else:
                    delta_y = 0.1 * height
                    delta_x = delta_y + ((height - width) / 2)

                # Set the starting and ending point of the normalization bounding box
                starting_point = (min(landmarks_x_values) - delta_x, min(landmarks_y_values) - delta_y)
                ending_point = (max(landmarks_x_values) + delta_x, max(landmarks_y_values) + delta_y)

                # Normalize individual landmarks and save the results
                for identifier in HAND_IDENTIFIERS:
                    key = identifier + "_" + str(hand_index) + "_"

                    # Prevent from trying to normalize incorrectly captured points
                    if row[key + "X"][sequence_index] == 0 or (ending_point[0] - starting_point[0]) == 0 or \
                            (starting_point[1] - ending_point[1]) == 0:
                        continue

                    normalized_x = (row[key + "X"][sequence_index] - starting_point[0]) / (ending_point[0] -
                                                                                           starting_point[0])
                    normalized_y = (row[key + "Y"][sequence_index] - ending_point[1]) / (starting_point[1] -
                                                                                         ending_point[1])

                    row[key + "X"][sequence_index] = normalized_x
                    row[key + "Y"][sequence_index] = normalized_y

        normalized_rows.append(row)

    # DataFrame.append was removed in pandas 2.0; assemble the result in one go.
    if normalized_rows:
        normalized_df = pd.DataFrame(normalized_rows).reset_index(drop=True)
    else:
        normalized_df = pd.DataFrame(columns=df.columns)

    return normalized_df
def normalize_single_dict(row: dict):
    """
    Normalizes the skeletal data for a given sequence of frames with signer's hand pose data. The normalization
    follows the definition from our paper.

    :param row: Dictionary containing key-value pairs with joint identifiers and corresponding lists (sequences) of
                that particular joints coordinates
    :return: Dictionary with normalized skeletal data (following the same schema as input data)
    """
    # One hand by default; a second hand is processed only when its keys exist.
    hand_count = 2 if "wrist_1" in row.keys() else 1

    # Keys of all joints belonging to each hand, in HAND_IDENTIFIERS order.
    joint_keys = {hand: [f"{identifier}_{hand}" for identifier in HAND_IDENTIFIERS]
                  for hand in range(hand_count)}
    for hand in range(2):
        joint_keys.setdefault(hand, [])

    # Treat each hand individually
    for hand in range(hand_count):
        frame_count = len(row[f"wrist_{hand}"])

        # Treat each element of the sequence (analyzed frame) individually
        for frame in range(frame_count):
            # Collect the non-zero X and Y coordinates of this frame's landmarks
            xs = [row[key][frame][0] for key in joint_keys[hand] if row[key][frame][0] != 0]
            ys = [row[key][frame][1] for key in joint_keys[hand] if row[key][frame][1] != 0]

            # Nothing to normalize when no landmark was captured in this frame
            if not xs or not ys:
                continue

            # Padding that turns the hand's extent into a square bounding box
            width = max(xs) - min(xs)
            height = max(ys) - min(ys)
            if width > height:
                pad_x = 0.1 * width
                pad_y = pad_x + ((width - height) / 2)
            else:
                pad_y = 0.1 * height
                pad_x = pad_y + ((height - width) / 2)

            # Bounding box corners used for the normalization
            box_min = (min(xs) - pad_x, min(ys) - pad_y)
            box_max = (max(xs) + pad_x, max(ys) + pad_y)

            # NOTE(review): normalize_hands_full maps Y as (y - end)/(start - end)
            # (flipped), while this function maps it as (y - start)/(end - start).
            # Confirm the divergence is intentional.
            for key in joint_keys[hand]:
                point = row[key][frame]

                # Skip uncaptured points and degenerate (zero-area) boxes
                if point[0] == 0 or (box_max[0] - box_min[0]) == 0 or (box_min[1] - box_max[1]) == 0:
                    continue

                scaled_x = (point[0] - box_min[0]) / (box_max[0] - box_min[0])
                scaled_y = (point[1] - box_min[1]) / (box_max[1] - box_min[1])

                # Replace only the X/Y components, preserving any extra entries.
                updated = list(point)
                updated[0] = scaled_x
                updated[1] = scaled_y
                row[key][frame] = updated

    return row


if __name__ == "__main__":
    pass

47
normalization/main.py Normal file
View File

@@ -0,0 +1,47 @@
import os
import ast

import pandas as pd

from normalization.hand_normalization import normalize_hands_full
from normalization.body_normalization import normalize_body_full

DATASET_PATH = './data'

# Load the dataset
df = pd.read_csv(os.path.join(DATASET_PATH, "WLASL_test_15fps.csv"), encoding="utf-8")

# Pull out metadata that must not go through normalization
video_size_heights = df["video_size_height"].to_list()
video_size_widths = df["video_size_width"].to_list()

# Delete redundant (non-related) properties
del df["video_size_height"]
del df["video_size_width"]

# Temporarily remove other relevant metadata; it is restored after normalization
labels = df["labels"].to_list()
video_fps = df["video_fps"].to_list()
del df["labels"]
del df["video_fps"]

# Landmark cells are stored as stringified lists in the CSV; parse them back
for column in df.columns:
    df[column] = df[column].apply(lambda cell: ast.literal_eval(str(cell)))

# Perform the normalizations
df = normalize_hands_full(df)
df, invalid_row_indexes = normalize_body_full(df)

# Rows that could not be normalized are kept (with original values) by
# normalize_body_full, so the metadata lists still line up row-for-row and the
# filtering below stays disabled.
# labels = [t for i, t in enumerate(labels) if i not in invalid_row_indexes]
# video_fps = [t for i, t in enumerate(video_fps) if i not in invalid_row_indexes]

# Return the metadata back to the dataset
df["labels"] = labels
df["video_fps"] = video_fps

df.to_csv(os.path.join(DATASET_PATH, "WLASL_test_15fps_normalized.csv"), encoding="utf-8", index=False)