Some changes to allow training with kaggle data

This commit is contained in:
2023-04-13 14:55:16 +00:00
parent c49645d7bc
commit 7c973f1b88
13 changed files with 1933 additions and 102 deletions

View File

@@ -61,20 +61,25 @@ def map_blazepose_keypoint(column):
return f"{mapped}_{hand}{suffix}" return f"{mapped}_{hand}{suffix}"
def map_blazepose_df(df): def map_blazepose_df(df, rename=True):
to_drop = []
if rename:
renamings = {}
for column in df.columns:
mapped_column = map_blazepose_keypoint(column)
if mapped_column:
renamings[column] = mapped_column
else:
to_drop.append(column)
df = df.rename(columns=renamings)
for index, row in df.iterrows(): for index, row in df.iterrows():
sequence_size = len(row["leftEar_Y"])
lsx = row["leftShoulder_X"] lsx = row["leftShoulder_X"]
rsx = row["rightShoulder_X"] rsx = row["rightShoulder_X"]
lsy = row["leftShoulder_Y"] lsy = row["leftShoulder_Y"]
rsy = row["rightShoulder_Y"] rsy = row["rightShoulder_Y"]
# convert all to list
lsx = lsx[1:-1].split(",")
rsx = rsx[1:-1].split(",")
lsy = lsy[1:-1].split(",")
rsy = rsy[1:-1].split(",")
sequence_size = len(lsx)
neck_x = [] neck_x = []
neck_y = [] neck_y = []
# Treat each element of the sequence (analyzed frame) individually # Treat each element of the sequence (analyzed frame) individually
@@ -84,4 +89,5 @@ def map_blazepose_df(df):
df.loc[index, "neck_X"] = str(neck_x) df.loc[index, "neck_X"] = str(neck_x)
df.loc[index, "neck_Y"] = str(neck_y) df.loc[index, "neck_Y"] = str(neck_y)
return df df.drop(columns=to_drop, inplace=True)
return df

View File

@@ -5,30 +5,30 @@ import pandas as pd
from normalization.hand_normalization import normalize_hands_full from normalization.hand_normalization import normalize_hands_full
from normalization.body_normalization import normalize_body_full from normalization.body_normalization import normalize_body_full
DATASET_PATH = './data/wlasl' DATASET_PATH = './data/processed'
# Load the dataset # Load the dataset
df = pd.read_csv(os.path.join(DATASET_PATH, "WLASL100_train.csv"), encoding="utf-8") df = pd.read_csv(os.path.join(DATASET_PATH, "spoter_train.csv"), encoding="utf-8")
print(df.head()) print(df.head())
print(df.columns) print(df.columns)
# Retrieve metadata # Retrieve metadata
video_size_heights = df["video_height"].to_list() # video_size_heights = df["video_height"].to_list()
video_size_widths = df["video_width"].to_list() # video_size_widths = df["video_width"].to_list()
# Delete redundant (non-related) properties # Delete redundant (non-related) properties
del df["video_height"] # del df["video_height"]
del df["video_width"] # del df["video_width"]
# Temporarily remove other relevant metadata # Temporarily remove other relevant metadata
labels = df["labels"].to_list() labels = df["labels"].to_list()
video_fps = df["fps"].to_list() signs = df["sign"].to_list()
del df["labels"] del df["labels"]
del df["fps"] del df["sign"]
del df["split"] del df["path"]
del df["video_id"] del df["participant_id"]
del df["label_name"] del df["sequence_id"]
del df["length"]
# Convert the strings into lists # Convert the strings into lists
@@ -41,7 +41,7 @@ for column in df.columns:
# Perform the normalizations # Perform the normalizations
df = normalize_hands_full(df) df = normalize_hands_full(df)
df, invalid_row_indexes = normalize_body_full(df) # df, invalid_row_indexes = normalize_body_full(df)
# Clear lists of items from deleted rows # Clear lists of items from deleted rows
# labels = [t for i, t in enumerate(labels) if i not in invalid_row_indexes] # labels = [t for i, t in enumerate(labels) if i not in invalid_row_indexes]
@@ -49,6 +49,6 @@ df, invalid_row_indexes = normalize_body_full(df)
# Return the metadata back to the dataset # Return the metadata back to the dataset
df["labels"] = labels df["labels"] = labels
df["fps"] = video_fps df["sign"] = signs
df.to_csv(os.path.join(DATASET_PATH, "wlasl_train_norm.csv"), encoding="utf-8", index=False) df.to_csv(os.path.join(DATASET_PATH, "spoter_train_norm.csv"), encoding="utf-8", index=False)

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,5 @@
from argparse import ArgumentParser from argparse import ArgumentParser
from preprocessing.create_wlasl_landmarks_dataset import parse_create_args, create from preprocessing.create_fingerspelling_dataset import parse_create_args, create
from preprocessing.extract_mediapipe_landmarks import parse_extract_args, extract from preprocessing.extract_mediapipe_landmarks import parse_extract_args, extract

View File

@@ -0,0 +1,172 @@
import os
import os.path as op
import json
import shutil
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
from utils import get_logger
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from normalization.blazepose_mapping import map_blazepose_df
# Base data directory (presumably the root under which dataset folders live).
BASE_DATA_FOLDER = 'data/'
# Short aliases for the MediaPipe solution modules.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands
mp_holistic = mp.solutions.holistic
# Landmark enumerations of the holistic solution; get_landmarks_names()
# iterates over them to build the landmark column names.
pose_landmarks = mp_holistic.PoseLandmark
hand_landmarks = mp_holistic.HandLandmark
def get_landmarks_names():
    """Return the landmark column names as one comma-separated string.

    The names are emitted as ``<name>_x,<name>_y`` pairs: first every pose
    landmark, then every hand landmark prefixed with ``left_hand_`` and
    finally every hand landmark prefixed with ``right_hand_``. Landmark
    names are lower-cased.
    """
    def _xy_columns(landmarks, prefix=''):
        # One "<prefix><name>_x" / "<prefix><name>_y" pair per landmark.
        columns = []
        for landmark in landmarks:
            base = f'{prefix}{landmark.name.lower()}'
            columns.extend((f'{base}_x', f'{base}_y'))
        return ','.join(columns)

    groups = (
        _xy_columns(pose_landmarks),
        _xy_columns(hand_landmarks, 'left_hand_'),
        _xy_columns(hand_landmarks, 'right_hand_'),
    )
    return ','.join(groups)
def convert_to_str(arr, precision=6):
    """Serialize a value to the string form stored in the CSV cells.

    Args:
        arr: Value to serialize. NumPy arrays are rendered element by
            element; anything else is passed through ``str``.
        precision: Number of decimal places used for non-zero array
            elements.

    Returns:
        For a NumPy array, ``"[v1,v2,...]"`` with no spaces, where elements
        equal to zero are written as ``"0"`` and all others in fixed-point
        notation. For any other value, ``str(arr)``.
    """
    if not isinstance(arr, np.ndarray):
        return str(arr)

    # Zeros are written compactly as "0"; everything else gets fixed precision.
    rendered = ['0' if item == 0 else f'{item:.{precision}f}' for item in arr]
    return '[' + ','.join(rendered) + ']'
def parse_create_args(parser):
    """Register the dataset-creation command-line options on ``parser``.

    The parser is modified in place and nothing is returned. Each entry of
    the table below is a ``(flags, keyword options)`` pair forwarded to
    ``parser.add_argument``.
    """
    argument_specs = [
        (('--landmarks-dataset', '-lmks'),
         {'required': True,
          'help': 'Path to folder with landmarks npy files. '
                  'You need to run `extract_mediapipe_landmarks.py` script first'}),
        (('--dataset-folder', '-df'),
         {'default': 'data/wlasl',
          'help': 'Path to folder where original `WLASL_v0.3.json` and `id_to_label.json` are stored. '
                  'Note that final CSV files will be saved in this folder too.'}),
        (('--videos-folder', '-videos'),
         {'default': None,
          'help': 'Path to folder with videos. If None, then no information of videos (fps, length, '
                  'width and height) will be stored in final csv file'}),
        (('--num-classes', '-nc'),
         {'default': 100,
          'type': int,
          'help': 'Number of classes to use in WLASL dataset'}),
        (('--create-new-split',),
         {'action': 'store_true'}),
        (('--test-size', '-ts'),
         {'default': 0.25,
          'type': float,
          'help': 'Test split percentage size. Only required if --create-new-split is set'}),
    ]
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
# python3 preprocessing.py --landmarks-dataset=data/landmarks -videos data/wlasl/videos
def create(args):
    """Build the fingerspelling train/val CSV files from extracted landmarks.

    Every entry of ``args.landmarks_dataset`` is expected to be named
    ``<label>!<split>....npy``. The label and split are taken from the file
    name, optional video metadata is read with OpenCV, the landmark arrays
    are loaded and merged with the metadata, the landmark columns are passed
    through ``map_blazepose_df`` and the result is written to
    ``fingerspelling_train.csv`` and ``fingerspelling_val.csv`` inside
    ``args.dataset_folder``. An ``id_to_label.json`` mapping is written to
    the same folder.

    Args:
        args: Namespace with the attributes ``landmarks_dataset``,
            ``videos_folder``, ``dataset_folder``, ``num_classes``,
            ``test_size`` and ``create_new_split`` (see
            ``parse_create_args``).

    Raises:
        ValueError: If no usable landmark file is found.
        AssertionError: If the train and test parts do not contain the same
            set of labels.
    """
    logger = get_logger(__name__)

    landmarks_dataset = args.landmarks_dataset
    videos_folder = args.videos_folder
    dataset_folder = args.dataset_folder
    num_classes = args.num_classes
    test_size = args.test_size

    os.makedirs(dataset_folder, exist_ok=True)

    # get files in landmarks_dataset folder
    landmarks_files = os.listdir(landmarks_dataset)

    video_data = []
    for file in tqdm(landmarks_files):
        # File names encode "<label>!<split>..."; skip stray files instead of
        # crashing with an IndexError on the missing separator.
        name_parts = file.split('!')
        if len(name_parts) < 2:
            logger.warning(f'{file} does not follow the <label>!<split> naming. Skipping')
            continue
        label = name_parts[0]
        subset = name_parts[1].split('.')[0]
        # the video id is the file name without the npy extension
        video_id = file.replace('.npy', "")
        video_dict = {'video_id': video_id,
                      'label_name': label,
                      'split': subset}
        if videos_folder is not None:
            cap = cv2.VideoCapture(op.join(videos_folder, f'{video_id}.mp4'))
            try:
                if not cap.isOpened():
                    logger.warning(f'Video {video_id}.mp4 not found')
                    continue
                width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
                height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
                fps = cap.get(cv2.CAP_PROP_FPS)
                frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
            finally:
                # Always free the capture handle, also on the skip path.
                cap.release()
            if fps > 0:
                length = frame_count / float(fps)
            else:
                # A reported FPS of 0 would otherwise raise ZeroDivisionError.
                logger.warning(f'Video {video_id}.mp4 reports no FPS. Length set to 0')
                length = 0.0
            video_info = {'video_width': width,
                          'video_height': height,
                          'fps': fps,
                          'length': length}
            video_dict.update(video_info)
        video_data.append(video_dict)

    if not video_data:
        raise ValueError(f'No usable landmark files found in {landmarks_dataset}')

    df_video = pd.DataFrame(video_data)
    video_ids = df_video['video_id'].unique()
    lmks_data = []
    lmks_names = get_landmarks_names().split(',')

    # map each distinct label name to an integer id (order of first appearance)
    labels = df_video['label_name'].unique()
    label_to_id = {label: i for i, label in enumerate(labels)}
    df_video['labels'] = df_video['label_name'].map(label_to_id)

    # export the inverse mapping (id -> label name) as json
    id_to_label = {i: label for label, i in label_to_id.items()}
    with open(op.join(dataset_folder, 'id_to_label.json'), 'w') as f:
        json.dump(id_to_label, f, indent=4)

    for video_id in video_ids:
        lmk_fn = op.join(landmarks_dataset, f'{video_id}.npy')
        if not op.exists(lmk_fn):
            logger.warning(f'{lmk_fn} file not found. Skipping')
            continue
        # Transposed so that iterating yields one series per landmark column
        # (presumably the file is stored as frames x coordinates -- confirm).
        lmk = np.load(lmk_fn).T
        lmks_dict = {'video_id': video_id}
        for lmk_, name in zip(lmk, lmks_names):
            lmks_dict[name] = lmk_
        lmks_data.append(lmks_dict)

    df_lmks = pd.DataFrame(lmks_data)
    df = pd.merge(df_video, df_lmks)

    aux_columns = ['split', 'video_id', 'labels', 'label_name']
    if videos_folder is not None:
        aux_columns += ['video_width', 'video_height', 'fps', 'length']
    # Keep the metadata aside and re-attach it after the landmark mapping.
    df_aux = df[aux_columns]
    # NOTE(review): map_blazepose_df appears to read the shoulder columns as
    # bracketed strings, while the values here are NumPy arrays -- confirm
    # that the two are compatible.
    df = map_blazepose_df(df)
    df = pd.concat([df, df_aux], axis=1)

    if args.create_new_split:
        df_train, df_test = train_test_split(df, test_size=test_size, stratify=df['labels'], random_state=42)
    else:
        print(df['split'].unique())
        df_train = df[(df['split'] == 'train') | (df['split'] == 'val')]
        df_test = df[df['split'] == 'test']

    print(f'Num classes: {num_classes}')
    print(df_train['labels'].value_counts())
    assert set(df_train['labels'].unique()) == set(df_test['labels'].unique()), \
        'The labels for train and test dataframe are different. We recommend to download the dataset again, or to use ' \
        'the --create-new-split flag'

    # The held-out part is written under the "val" name.
    for split, df_split in zip(['train', 'val'],
                               [df_train, df_test]):
        fn_out = op.join(dataset_folder, f'fingerspelling_{split}.csv')
        (df_split.reset_index(drop=True)
                 .applymap(convert_to_str)
                 .to_csv(fn_out, index=False))

View File

@@ -4,6 +4,8 @@ import pandas as pd
from tqdm.auto import tqdm from tqdm.auto import tqdm
import json import json
from normalization.blazepose_mapping import map_blazepose_df
def create(train_landmark_files, train_csv, dataset_folder, test_size): def create(train_landmark_files, train_csv, dataset_folder, test_size):
os.makedirs(dataset_folder, exist_ok=True) os.makedirs(dataset_folder, exist_ok=True)
@@ -17,15 +19,15 @@ def create(train_landmark_files, train_csv, dataset_folder, test_size):
mapping = { mapping = {
'pose_0': 'nose', 'pose_0': 'nose',
'pose_1': 'leftEye', 'pose_1': 'leftEye',
'pose_2': 'rightEye', 'pose_4': 'rightEye',
'pose_3': 'leftEar', 'pose_7': 'leftEar',
'pose_4': 'rightEar', 'pose_8': 'rightEar',
'pose_5': 'leftShoulder', 'pose_11': 'leftShoulder',
'pose_6': 'rightShoulder', 'pose_12': 'rightShoulder',
'pose_7': 'leftElbow', 'pose_13': 'leftElbow',
'pose_8': 'rightElbow', 'pose_14': 'rightElbow',
'pose_9': 'leftWrist', 'pose_15': 'leftWrist',
'pose_10': 'rightWrist', 'pose_16': 'rightWrist',
'left_hand_0': 'wrist_left', 'left_hand_0': 'wrist_left',
'left_hand_1': 'thumbCMC_left', 'left_hand_1': 'thumbCMC_left',
@@ -77,7 +79,7 @@ def create(train_landmark_files, train_csv, dataset_folder, test_size):
columns.append(f'{v}_X') columns.append(f'{v}_X')
columns.append(f'{v}_Y') columns.append(f'{v}_Y')
for _, row in tqdm(train_df.head(6000).iterrows(), total=6000): for _, row in tqdm(train_df.head(10000).iterrows(), total=10000):
path, participant_id, sequence_id, sign = row['path'], row['participant_id'], row['sequence_id'], row['sign'] path, participant_id, sequence_id, sign = row['path'], row['participant_id'], row['sequence_id'], row['sign']
parquet_file = os.path.join(train_landmark_files, str(participant_id), f"{sequence_id}.parquet") parquet_file = os.path.join(train_landmark_files, str(participant_id), f"{sequence_id}.parquet")
@@ -136,6 +138,7 @@ def create(train_landmark_files, train_csv, dataset_folder, test_size):
video_data.append(new_landmark_data) video_data.append(new_landmark_data)
video_data = pd.concat(video_data, axis=0, ignore_index=True) video_data = pd.concat(video_data, axis=0, ignore_index=True)
video_data = map_blazepose_df(video_data, rename=False)
video_data.to_csv(os.path.join(dataset_folder, 'spoter.csv'), index=False) video_data.to_csv(os.path.join(dataset_folder, 'spoter.csv'), index=False)
train_landmark_files = 'data/train_landmark_files' train_landmark_files = 'data/train_landmark_files'

View File

@@ -110,6 +110,7 @@ def create(args):
'length': length} 'length': length}
video_dict.update(video_info) video_dict.update(video_info)
video_data.append(video_dict) video_data.append(video_dict)
df_video = pd.DataFrame(video_data) df_video = pd.DataFrame(video_data)
video_ids = df_video['video_id'].unique() video_ids = df_video['video_id'].unique()
lmks_data = [] lmks_data = []
@@ -126,9 +127,7 @@ def create(args):
lmks_data.append(lmks_dict) lmks_data.append(lmks_dict)
df_lmks = pd.DataFrame(lmks_data) df_lmks = pd.DataFrame(lmks_data)
print(df_lmks)
df = pd.merge(df_video, df_lmks) df = pd.merge(df_video, df_lmks)
print(df)
aux_columns = ['split', 'video_id', 'labels', 'label_name'] aux_columns = ['split', 'video_id', 'labels', 'label_name']
if videos_folder is not None: if videos_folder is not None:
aux_columns += ['video_width', 'video_height', 'fps', 'length'] aux_columns += ['video_width', 'video_height', 'fps', 'length']

View File

@@ -132,6 +132,12 @@ def extract(args):
ret, image_orig = cap.read() ret, image_orig = cap.read()
height, width = image_orig.shape[:2] height, width = image_orig.shape[:2]
landmarks_video = [] landmarks_video = []
# make sure fps is 20 by determining the number of frames to be skipped
frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
frame_skip = (frame_rate // 20) - 1
with tqdm(total=int(cap.get(cv2.CAP_PROP_FRAME_COUNT))) as pbar: with tqdm(total=int(cap.get(cv2.CAP_PROP_FRAME_COUNT))) as pbar:
with mp_holistic.Holistic( with mp_holistic.Holistic(
static_image_mode=False, static_image_mode=False,
@@ -145,6 +151,9 @@ def extract(args):
print(e) print(e)
landmarks = get_landmarks(image_orig, holistic, debug=True) landmarks = get_landmarks(image_orig, holistic, debug=True)
ret, image_orig = cap.read() ret, image_orig = cap.read()
for _ in range(frame_skip):
ret, image_orig = cap.read()
pbar.update(1)
landmarks_video.append(landmarks) landmarks_video.append(landmarks)
pbar.update(1) pbar.update(1)
landmarks_video = np.vstack(landmarks_video) landmarks_video = np.vstack(landmarks_video)

View File

@@ -8,7 +8,6 @@ dataset = "data/processed/spoter.csv"
# read the dataset # read the dataset
df = pd.read_csv(dataset) df = pd.read_csv(dataset)
df = map_blazepose_df(df)
with open("data/sign_to_prediction_index_map.json", "r") as f: with open("data/sign_to_prediction_index_map.json", "r") as f:
sign_to_prediction_index_max = json.load(f) sign_to_prediction_index_max = json.load(f)

View File

@@ -1,7 +1,6 @@
pandas pandas
bokeh==2.4.3 bokeh==2.4.3
boto3>=1.9 boto3>=1.9
clearml==1.6.4
ipywidgets==8.0.4 ipywidgets==8.0.4
matplotlib==3.5.3 matplotlib==3.5.3
mediapipe==0.8.11 mediapipe==0.8.11
@@ -9,6 +8,7 @@ notebook==6.5.2
opencv-python==4.6.0.66 opencv-python==4.6.0.66
plotly==5.11.0 plotly==5.11.0
scikit-learn==1.0.2 scikit-learn==1.0.2
clearml==1.10.3
torch torch
torchvision torchvision
tqdm==4.54.1 tqdm==4.54.1

View File

@@ -15,7 +15,7 @@ from torchvision import transforms
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from pathlib import Path from pathlib import Path
import copy import copy
import numpy as np
from datasets import CzechSLRDataset, SLREmbeddingDataset, collate_fn_triplet_padd, collate_fn_padd from datasets import CzechSLRDataset, SLREmbeddingDataset, collate_fn_triplet_padd, collate_fn_padd
from models import SPOTER, SPOTER_EMBEDDINGS, train_epoch, evaluate, train_epoch_embedding, \ from models import SPOTER, SPOTER_EMBEDDINGS, train_epoch, evaluate, train_epoch_embedding, \
train_epoch_embedding_online, evaluate_embedding train_epoch_embedding_online, evaluate_embedding
@@ -32,7 +32,7 @@ except ImportError:
pass pass
PROJECT_NAME = "spoter" PROJECT_NAME = "SpoterEmbedding"
CLEARML = "clearml" CLEARML = "clearml"

View File

@@ -1,22 +1,21 @@
#!/bin/sh #!/bin/sh
python -m train \ python -m train \
--save_checkpoints_every 10 \ --save_checkpoints_every 10 \
--experiment_name "augment_rotate_75_x8" \ --experiment_name "basic" \
--epochs 300 \ --epochs 300 \
--optimizer "ADAM" \ --optimizer "ADAM" \
--lr 0.001 \ --lr 0.0001 \
--batch_size 16 \ --batch_size 16 \
--dataset_name "processed" \ --dataset_name "GoogleWLASL" \
--training_set_path "spoter_train.csv" \ --training_set_path "spoter_train.csv" \
--validation_set_path "spoter_test.csv" \ --validation_set_path "spoter_test.csv" \
--vector_length 32 \ --vector_length 32 \
--epoch_iters -1 \ --epoch_iters -1 \
--scheduler_factor 0 \ --scheduler_factor 0.2 \
--hard_triplet_mining "in_batch" \ --hard_triplet_mining "None" \
--filter_easy_triplets \ --filter_easy_triplets \
--triplet_loss_margin 1 \ --triplet_loss_margin 2 \
--dropout 0.2 \ --dropout 0.2 \
--augmentations_prob=0.75 \ --tracker=clearml \
--hard_mining_scheduler_triplets_threshold=0 \ --dataset_loader=clearml \
--normalize_embeddings \ --dataset_project="SpoterEmbedding"
--num_classes 100 \

1632
visualize_data.ipynb Normal file

File diff suppressed because one or more lines are too long