Initial codebase (#1)
* Add project code * Logger improvements * Improvements to web demo code * added create_wlasl_landmarks_dataset.py and extract_mediapipe_landmarks.py * Fix rotation augmentation * fixed error in docstring, and removed unnecessary replace -1 -> 0 * Readme updates * Share base notebooks * Add notebooks and unify for different datasets * requirements update * fixes * Make evaluate more deterministic * Allow training with clearml * refactor preprocessing and apply linter * Minor fixes * Minor notebook tweaks * Readme updates * Fix PR comments * Remove unneeded code * Add banner to Readme --------- Co-authored-by: Gabriel Lema <gabriel.lema@xmartlabs.com>
This commit is contained in:
0
preprocessing/__init__.py
Normal file
0
preprocessing/__init__.py
Normal file
155
preprocessing/create_wlasl_landmarks_dataset.py
Normal file
155
preprocessing/create_wlasl_landmarks_dataset.py
Normal file
@@ -0,0 +1,155 @@
|
||||
import os
|
||||
import os.path as op
|
||||
import json
|
||||
import shutil
|
||||
|
||||
import cv2
|
||||
import mediapipe as mp
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from utils import get_logger
|
||||
from tqdm.auto import tqdm
|
||||
from sklearn.model_selection import train_test_split
|
||||
from normalization.blazepose_mapping import map_blazepose_df
|
||||
|
||||
# Folder the original WLASL json files (`wlasl/id_to_label.json` and
# `wlasl/WLASL_v0.3.json`) are copied from by `create`.
BASE_DATA_FOLDER = 'data/'
|
||||
|
||||
# Short aliases for the mediapipe solution modules.
mp_drawing = mp.solutions.drawing_utils
|
||||
mp_drawing_styles = mp.solutions.drawing_styles
|
||||
mp_hands = mp.solutions.hands
|
||||
mp_holistic = mp.solutions.holistic
|
||||
# Landmark enums iterated by `get_landmarks_names` to build the column names.
pose_landmarks = mp_holistic.PoseLandmark
|
||||
hand_landmarks = mp_holistic.HandLandmark
|
||||
|
||||
|
||||
def get_landmarks_names():
    '''
    Builds the landmark column names for the mediapipe holistic model.

    Every landmark contributes an `<name>_x` and an `<name>_y` entry. Pose landmarks come first
    (no prefix), followed by the hand landmarks prefixed with `left_hand_` and then `right_hand_`.

    Returns: a single string with all the names joined by commas
    '''
    groups = (('', pose_landmarks),
              ('left_hand_', hand_landmarks),
              ('right_hand_', hand_landmarks))
    columns = []
    for prefix, landmark_enum in groups:
        for member in landmark_enum:
            base_name = prefix + member.name.lower()
            columns.append(base_name + '_x')
            columns.append(base_name + '_y')
    return ','.join(columns)
|
||||
|
||||
|
||||
def convert_to_str(arr, precision=6):
    '''
    Serializes a cell value to the text stored in the output CSV.

    Numpy arrays become `[v0,v1,...]`, where zeros are written as a bare `0` and every other
    element uses fixed-point notation with `precision` decimals. Anything that is not a numpy
    array is passed through `str`.
    '''
    if not isinstance(arr, np.ndarray):
        return str(arr)
    formatted = ['0' if item == 0 else f'{item:.{precision}f}' for item in arr]
    return '[' + ','.join(formatted) + ']'
|
||||
|
||||
|
||||
def parse_create_args(parser):
    '''
    Registers the command line options read by `create` on `parser`.

    Args:
        parser: object exposing `add_argument` (an `argparse.ArgumentParser` or a sub-parser)

    Help texts are built with implicit string concatenation instead of a backslash continuation
    inside the literal, so the indentation of the source code does not leak into the help message.
    '''
    parser.add_argument('--landmarks-dataset', '-lmks', required=True,
                        help='Path to folder with landmarks npy files. '
                             'You need to run `extract_mediapipe_landmarks.py` script first')
    parser.add_argument('--dataset-folder', '-df', default='data/wlasl',
                        help='Path to folder where original `WLASL_v0.3.json` and `id_to_label.json` are stored. '
                             'Note that final CSV files will be saved in this folder too.')
    parser.add_argument('--videos-folder', '-videos', default=None,
                        help='Path to folder with videos. If None, then no information of videos (fps, length, '
                             'width and height) will be stored in final csv file')
    parser.add_argument('--num-classes', '-nc', default=100, type=int,
                        help='Number of classes to use in WLASL dataset')
    parser.add_argument('--create-new-split', action='store_true',
                        help='Ignore the split stored in `WLASL_v0.3.json` and create a new stratified '
                             'train/test split instead')
    parser.add_argument('--test-size', '-ts', default=0.25, type=float,
                        help='Test split percentage size. Only required if --create-new-split is set')
|
||||
|
||||
|
||||
def _copy_into_folder(src, dst_folder):
    '''
    Copies `src` into `dst_folder`, doing nothing when `src` already is the destination file
    (`shutil.copy` raises `SameFileError` in that situation)
    '''
    dst = op.join(dst_folder, op.basename(src))
    if op.exists(dst) and op.samefile(src, dst):
        return
    shutil.copy(src, dst)


def _read_video_info(video_fn):
    '''
    Reads width, height, fps and length (frame count / fps) of a video.

    Returns: dict with `video_width`, `video_height`, `fps` and `length`, or None when the video
    cannot be opened. `length` is NaN when the reported fps is 0.
    '''
    cap = cv2.VideoCapture(video_fn)
    try:
        if not cap.isOpened():
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        return {'video_width': cap.get(cv2.CAP_PROP_FRAME_WIDTH),
                'video_height': cap.get(cv2.CAP_PROP_FRAME_HEIGHT),
                'fps': fps,
                # A reported fps of 0 would make the division fail; mark the length as unknown
                'length': frame_count / float(fps) if fps else float('nan')}
    finally:
        # Always free the capture handle, also for videos that could not be opened
        cap.release()


# python3 preprocessing.py --landmarks-dataset=data/landmarks -videos data/wlasl/videos
def create(args):
    '''
    Creates the WLASL landmarks CSV files from the per-video landmarks npy files.

    Steps:
        1. Copies `id_to_label.json` and `WLASL_v0.3.json` from `BASE_DATA_FOLDER/wlasl` into the
           dataset folder (skipped when they are already there).
        2. Collects label and split (and optionally video information) for every instance of the
           first `num_classes` entries of `WLASL_v0.3.json`.
        3. Loads `<video_id>.npy` for each video and names each row of the transposed array with
           the names returned by `get_landmarks_names`.
        4. Maps the landmarks with `map_blazepose_df` and appends the auxiliary columns back.
        5. Splits the data and writes `WLASL<num_classes>_train.csv` and `WLASL<num_classes>_val.csv`.

    Args:
        args: namespace with the attributes registered by `parse_create_args`
            (`landmarks_dataset`, `videos_folder`, `dataset_folder`, `num_classes`, `test_size`
            and `create_new_split`)

    Raises:
        AssertionError: if train and test splits do not contain the same set of labels
    '''
    logger = get_logger(__name__)

    landmarks_dataset = args.landmarks_dataset
    videos_folder = args.videos_folder
    dataset_folder = args.dataset_folder
    num_classes = args.num_classes
    test_size = args.test_size

    os.makedirs(dataset_folder, exist_ok=True)

    # With the default --dataset-folder the source and the destination are the same files
    for json_name in ('id_to_label.json', 'WLASL_v0.3.json'):
        _copy_into_folder(os.path.join(BASE_DATA_FOLDER, 'wlasl', json_name), dataset_folder)

    wlasl_json_fn = op.join(dataset_folder, 'WLASL_v0.3.json')

    with open(wlasl_json_fn) as fid:
        data = json.load(fid)

    video_data = []
    # The position of each gloss in the json file is used as its numeric label
    for label_id, datum in enumerate(tqdm(data[:num_classes])):
        for instance in datum['instances']:
            video_id = instance['video_id']
            print(video_id)
            video_dict = {'video_id': video_id,
                          'label_name': datum['gloss'],
                          'labels': label_id,
                          'split': instance['split']}
            if videos_folder is not None:
                video_info = _read_video_info(op.join(videos_folder, f'{video_id}.mp4'))
                if video_info is None:
                    # Instances without a readable video are left out of the dataset
                    logger.warning(f'Video {video_id}.mp4 not found')
                    continue
                video_dict.update(video_info)
            video_data.append(video_dict)
    df_video = pd.DataFrame(video_data)
    video_ids = df_video['video_id'].unique()
    lmks_data = []
    lmks_names = get_landmarks_names().split(',')
    for video_id in video_ids:
        lmk_fn = op.join(landmarks_dataset, f'{video_id}.npy')
        if not op.exists(lmk_fn):
            logger.warning(f'{lmk_fn} file not found. Skipping')
            continue
        # After transposing, each row is paired with one landmark name
        lmk = np.load(lmk_fn).T
        lmks_dict = {'video_id': video_id}
        lmks_dict.update({name: lmk_ for lmk_, name in zip(lmk, lmks_names)})
        lmks_data.append(lmks_dict)

    df_lmks = pd.DataFrame(lmks_data)
    print(df_lmks)
    df = pd.merge(df_video, df_lmks)
    print(df)
    aux_columns = ['split', 'video_id', 'labels', 'label_name']
    if videos_folder is not None:
        aux_columns += ['video_width', 'video_height', 'fps', 'length']
    # Keep the auxiliary columns aside and append them again after the landmarks mapping
    df_aux = df[aux_columns]
    df = map_blazepose_df(df)
    df = pd.concat([df, df_aux], axis=1)
    if args.create_new_split:
        df_train, df_test = train_test_split(df, test_size=test_size, stratify=df['labels'], random_state=42)
    else:
        print(df['split'].unique())
        # The original `train` and `val` instances are both used for training
        df_train = df[(df['split'] == 'train') | (df['split'] == 'val')]
        df_test = df[df['split'] == 'test']

    print(f'Num classes: {num_classes}')
    print(df_train['labels'].value_counts())
    assert set(df_train['labels'].unique()) == set(df_test['labels'].unique()), (
        'The labels for train and test dataframe are different. We recommend to download the dataset again, or to use '
        'the --create-new-split flag')
    # Note that the held-out (test) dataframe is written under the `val` name
    for split, df_split in zip(['train', 'val'],
                               [df_train, df_test]):
        fn_out = op.join(dataset_folder, f'WLASL{num_classes}_{split}.csv')
        (df_split.reset_index(drop=True)
                 .applymap(convert_to_str)
                 .to_csv(fn_out, index=False))
|
||||
154
preprocessing/extract_mediapipe_landmarks.py
Normal file
154
preprocessing/extract_mediapipe_landmarks.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import os
|
||||
import os.path as op
|
||||
from itertools import chain
|
||||
from collections import namedtuple
|
||||
import glob
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import mediapipe as mp
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
# Short aliases for the mediapipe solution modules.
|
||||
mp_drawing = mp.solutions.drawing_utils
|
||||
mp_drawing_styles = mp.solutions.drawing_styles
|
||||
mp_holistic = mp.solutions.holistic
|
||||
mp_pose = mp.solutions.pose
|
||||
|
||||
# Number of landmarks in the pose group and in each hand group.
LEN_LANDMARKS_POSE = len(mp_holistic.PoseLandmark)
|
||||
LEN_LANDMARKS_HAND = len(mp_holistic.HandLandmark)
|
||||
# Pose plus both hands; `get_landmarks` returns two values (x, y) per landmark.
TOTAL_LANDMARKS = LEN_LANDMARKS_POSE + 2 * LEN_LANDMARKS_HAND
|
||||
|
||||
# Minimal (x, y) record used as a placeholder when a landmark group was not detected.
Landmark = namedtuple("Landmark", ["x", "y"])
|
||||
|
||||
|
||||
class LandmarksResults:
    """
    Wrapper around the results object returned by the holistic model.

    Each property returns the landmarks of one group (pose, left hand, right hand). When the
    wrapped results hold None for a group, a list of `Landmark(0, 0)` placeholders with the
    expected number of landmarks is returned instead.
    """

    def __init__(
        self,
        results,
        num_landmarks_pose=LEN_LANDMARKS_POSE,
        num_landmarks_hand=LEN_LANDMARKS_HAND,
    ):
        self.results = results
        self.num_landmarks_pose = num_landmarks_pose
        self.num_landmarks_hand = num_landmarks_hand

    @staticmethod
    def _landmarks_or_zeros(group, count):
        # Fall back to `count` zeroed landmarks when the group is missing
        if group is not None:
            return group.landmark
        return [Landmark(0, 0)] * count

    @property
    def pose_landmarks(self):
        return self._landmarks_or_zeros(
            self.results.pose_landmarks, self.num_landmarks_pose
        )

    @property
    def left_hand_landmarks(self):
        return self._landmarks_or_zeros(
            self.results.left_hand_landmarks, self.num_landmarks_hand
        )

    @property
    def right_hand_landmarks(self):
        return self._landmarks_or_zeros(
            self.results.right_hand_landmarks, self.num_landmarks_hand
        )
|
||||
|
||||
|
||||
def _flatten_xy(landmarks):
    """Returns [x0, y0, x1, y1, ...] for an iterable of landmarks"""
    return [coord for lmk in landmarks for coord in (lmk.x, lmk.y)]


def get_landmarks(image_orig, holistic, debug=False):
    """
    Runs landmarks detection for single image

    Args:
        image_orig: image in BGR channel order (it is converted to RGB before processing)
        holistic: object whose `process(image)` result is wrapped in `LandmarksResults`
        debug: when True, the size of each landmark group (pose, left hand, right hand) is
            checked separately before building the output, to tell which group is wrong

    Returns: flat list [x0, y0, x1, y1, ...] with pose, left hand and right hand landmarks,
    in that order (`2 * TOTAL_LANDMARKS` values)

    Raises:
        AssertionError: if the number of values does not match the expected one
    """
    # Convert the BGR image to RGB before processing.
    image = cv2.cvtColor(image_orig, cv2.COLOR_BGR2RGB)
    results = LandmarksResults(holistic.process(image))
    if debug:
        # Every landmark contributes two values (x and y), hence the factor of 2 for all groups
        groups = (
            (results.pose_landmarks, LEN_LANDMARKS_POSE),
            (results.left_hand_landmarks, LEN_LANDMARKS_HAND),
            (results.right_hand_landmarks, LEN_LANDMARKS_HAND),
        )
        for group, num_landmarks in groups:
            num_values = len(_flatten_xy(group))
            assert (
                num_values == 2 * num_landmarks
            ), f"{num_values} != {2 * num_landmarks}"
    landmarks = _flatten_xy(
        chain(
            results.pose_landmarks,
            results.left_hand_landmarks,
            results.right_hand_landmarks,
        )
    )
    assert (
        len(landmarks) == TOTAL_LANDMARKS * 2
    ), f"{len(landmarks)} != {TOTAL_LANDMARKS * 2}"
    return landmarks
|
||||
|
||||
|
||||
def parse_extract_args(parser):
    """
    Registers the command line options read by `extract` on `parser`.

    Both options are mandatory: the folder with the input videos and the folder where the
    landmarks npy files are written.
    """
    option_specs = (
        (
            "--videos-folder",
            "-videos",
            "Path of folder with videos to extract landmarks from",
        ),
        (
            "--output-landmarks",
            "-lmks",
            "Path of output folder where landmarks npy files will be saved",
        ),
    )
    for long_flag, short_flag, description in option_specs:
        parser.add_argument(long_flag, short_flag, help=description, required=True)
|
||||
|
||||
|
||||
def _extract_video_landmarks(fn_video):
    """
    Runs `get_landmarks` on every frame of a video.

    Returns: list with one flat landmarks list per frame (empty when no frame could be read)
    """
    landmarks_video = []
    cap = cv2.VideoCapture(fn_video)
    try:
        ret, image_orig = cap.read()
        if not ret:
            # Unreadable or empty video: nothing to process
            return landmarks_video
        with tqdm(total=int(cap.get(cv2.CAP_PROP_FRAME_COUNT))) as pbar:
            # A new model instance is created for every video
            with mp_holistic.Holistic(
                static_image_mode=False,
                min_detection_confidence=0.5,
                model_complexity=2,
            ) as holistic:
                while ret:
                    try:
                        landmarks = get_landmarks(image_orig, holistic)
                    except Exception as e:
                        # Report the failure and retry with the per-group checks enabled
                        print(e)
                        landmarks = get_landmarks(image_orig, holistic, debug=True)
                    landmarks_video.append(landmarks)
                    pbar.update(1)
                    ret, image_orig = cap.read()
    finally:
        # Free the capture handle even if the detection fails
        cap.release()
    return landmarks_video


# python3 preprocessing.py -videos=data/wlasl/videos_25fps/ -lmks=data/landmarks
def extract(args):
    """
    Extracts the landmarks of every mp4 video in a folder and saves one npy file per video.

    The output file is named after the video file without its extension and holds one row per
    frame with the values returned by `get_landmarks`. Videos from which no frame can be read
    are reported and skipped.

    Args:
        args: namespace with the attributes registered by `parse_extract_args`
            (`videos_folder` and `output_landmarks`)
    """
    landmarks_output = args.output_landmarks
    videos_folder = args.videos_folder
    os.makedirs(landmarks_output, exist_ok=True)
    for fn_video in tqdm(sorted(glob.glob(op.join(videos_folder, "*mp4")))):
        landmarks_video = _extract_video_landmarks(fn_video)
        if not landmarks_video:
            print(f"No frames could be read from {fn_video}. Skipping")
            continue
        # splitext only drops the extension, so names containing dots are kept whole
        video_name = op.splitext(op.basename(fn_video))[0]
        np.save(op.join(landmarks_output, video_name), np.vstack(landmarks_video))
|
||||
Reference in New Issue
Block a user