Added embeddings

2023-04-14 14:25:33 +02:00
parent ed0e0f198b
commit 8771fe0721
4 changed files with 4514 additions and 39 deletions
--- a/checkpoints/checkpoint_embed_1105.pth
+++ b/checkpoints/checkpoint_embed_1105.pth
--- a/embeddings/basic-signs/embeddings.csv
+++ b/embeddings/basic-signs/embeddings.csv
--- a/embeddings/fingerspelling/embeddings.csv
+++ b/embeddings/fingerspelling/embeddings.csv
--- a/webcam.py
+++ b/webcam.py
@@ -1,4 +1,6 @@
 from collections import Counter
 import cv2
 import mediapipe as mp
 import numpy as np
@@ -192,7 +194,7 @@ def normalize_hand(keypoints):
 # load training embedding csv
-df = pd.read_csv('data/fingerspelling/embeddings.csv')
+df = pd.read_csv('embeddings/basic-signs/embeddings.csv')
 def minkowski_distance_p(x, y, p=2):
    x = np.asarray(x)
@@ -248,7 +250,7 @@ def distance_matrix(keypoints, embeddings, p=2, threshold=1000000):
        return result
-CHECKPOINT_PATH = "out_checkpoints/checkpoint_embed_1105.pth"
+CHECKPOINT_PATH = "checkpoints/checkpoint_embed_1105.pth"
 checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
 model = SPOTER_EMBEDDINGS(
@@ -281,51 +283,60 @@ def make_prediction(keypoints):
    # calculate distance matrix
    dist_matrix = distance_matrix(new_embeddings, embeddings, p=2, threshold=1000000)
-    # find closest match
+    # get the 5 closest matches and select the class that is most common and use the average distance as the score
-    closest_match = np.argmin(dist_matrix[0])
+    # get the 5 closest matches
    indeces = np.argsort(dist_matrix)[0][:5]
    # get the labels
    labels = df["label_name"].iloc[indeces].tolist()
    c = Counter(labels).most_common()[0][0]
-    # if dist_matrix[0][closest_match] < 2:
+    # filter indeces to only include the most common label
-    return df.iloc[closest_match]["label_name"], dist_matrix[0][closest_match]
+    indeces = [i for i in indeces if df["label_name"].iloc[i] == c]
    # get the average distance
    score = np.mean(dist_matrix[0][indeces])
    return c, score
 # open webcam stream
-cap = cv2.VideoCapture(0)
+# cap = cv2.VideoCapture(0)
 while cap.isOpened():
    # read frame
    ret, frame = cap.read()
    pose = extract_keypoints(frame)
    if pose is None:
        cv2.imshow('MediaPipe Hands', frame)
        continue
    buffer.append(pose)
    if len(buffer) > 15:
        buffer.pop(0)
    if len(buffer) == 15:
        label, score = make_prediction(buffer)
        # draw label
        cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.putText(frame, str(score), (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
    # Show the frame
    cv2.imshow('MediaPipe Hands', frame)
    # Wait for key press to exit
    if cv2.waitKey(5) & 0xFF == 27:
        break
 # open video A.mp4
 # cap = cv2.VideoCapture('Z.mp4')
 # while cap.isOpened():
 #     # read frame
 #     ret, frame = cap.read()
 #     if frame is None:
 #         break
 #     pose = extract_keypoints(frame)
-#     buffer.append(pose)
+#     if pose is None:
 #         cv2.imshow('MediaPipe Hands', frame)
 #         continue
-# make_prediction(buffer)
+#     buffer.append(pose)
 #     if len(buffer) > 15:
 #         buffer.pop(0)
 #     if len(buffer) == 15:
 #         label, score = make_prediction(buffer)
 #         # draw label
 #         cv2.putText(frame, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
 #         cv2.putText(frame, str(score), (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
 #     # Show the frame
 #     cv2.imshow('MediaPipe Hands', frame)
 #     # Wait for key press to exit
 #     if cv2.waitKey(5) & 0xFF == 27:
 #         break
 # open video A.mp4
 cap = cv2.VideoCapture('E.mp4')
 while cap.isOpened():
    # read frame
    ret, frame = cap.read()
    if frame is None:
        break
    pose = extract_keypoints(frame)
    buffer.append(pose)
 label, score = make_prediction(buffer)
 print(label, score)