Week 12: Gesture Recognition for Human-Robot Interaction

Gesture Recognition

Hand Gesture Detection with MediaPipe

MediaPipe is Google's open-source framework for real-time perception pipelines, offering pre-trained models for hand tracking, pose estimation, face detection, and holistic body tracking. For humanoid robotics, hand gesture recognition enables non-verbal communication—pointing to objects, signaling stop/go, or demonstrating manipulation trajectories.

Why MediaPipe for Robotics?

Real-Time Performance: MediaPipe achieves 30+ FPS on CPU and 60+ FPS on GPU, suitable for responsive human-robot interaction without specialized hardware.

Cross-Platform: Runs on Linux, Windows, mobile devices, and embedded systems (Raspberry Pi, Jetson Nano), enabling deployment on diverse robot platforms.

Pre-Trained Models: MediaPipe's hand tracker identifies 21 3D landmarks per hand (fingertips, knuckles, wrist) without custom training—no dataset annotation required.

Multi-Hand Support: Detects and tracks multiple hands simultaneously, distinguishing left/right hand and maintaining identity across frames.

Installation and Basic Hand Tracking

# Install MediaPipe and OpenCV for visualization
import subprocess
subprocess.run(["pip", "install", "mediapipe", "opencv-python", "numpy"])

import cv2
import mediapipe as mp
import numpy as np

# Initialize MediaPipe hand tracking
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

def track_hands_in_video(video_source=0):
    """
    Real-time hand tracking from webcam or video file.

    Args:
        video_source: Camera index (0 for default webcam) or video file path
    """
    # Configure hand detector
    # max_num_hands: Maximum hands to detect (1-2 for most robotics tasks)
    # min_detection_confidence: Threshold for initial hand detection (0.0-1.0)
    # min_tracking_confidence: Threshold for landmark tracking (0.0-1.0)
    hands = mp_hands.Hands(
        static_image_mode=False,  # False for video stream (enables tracking)
        max_num_hands=2,
        min_detection_confidence=0.7,  # Higher = fewer false positives
        min_tracking_confidence=0.5
    )

    cap = cv2.VideoCapture(video_source)

    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        # Convert BGR to RGB (MediaPipe expects RGB)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process frame to detect hands
        results = hands.process(frame_rgb)

        # Draw hand landmarks on frame
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw 21 landmarks and connections
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawing_styles.get_default_hand_landmarks_style(),
                    mp_drawing_styles.get_default_hand_connections_style()
                )

        # Display annotated frame
        cv2.imshow('Hand Tracking', frame)

        if cv2.waitKey(5) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
    hands.close()

# Run hand tracker
track_hands_in_video()

Gesture Command Mapping

Map hand poses to robot commands by analyzing landmark configurations:

from typing import Optional, Dict
from dataclasses import dataclass

@dataclass
class HandGesture:
    """Represents a recognized gesture with confidence score."""
    name: str
    confidence: float
    parameters: Dict  # Additional info (e.g., pointing direction)

class GestureRecognizer:
    """
    Classify hand gestures from MediaPipe landmarks.
    Maps gestures to robot commands.
    """

    def __init__(self):
        self.hands = mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,  # Single hand for command gestures
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )

    def recognize_gesture(self, frame: np.ndarray) -> Optional[HandGesture]:
        """
        Detect and classify gesture in image frame.

        Args:
            frame: BGR image from camera

        Returns:
            HandGesture object or None if no gesture detected
        """
        # Process frame
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.hands.process(frame_rgb)

        if not results.multi_hand_landmarks:
            return None

        # Analyze first detected hand
        landmarks = results.multi_hand_landmarks[0]

        # Convert landmarks to numpy array for easier computation
        # Each landmark has x, y, z coordinates (normalized to [0, 1])
        points = np.array([
            [lm.x, lm.y, lm.z]
            for lm in landmarks.landmark
        ])

        # Classify gesture based on landmark configuration
        if self._is_thumbs_up(points):
            return HandGesture("thumbs_up", 0.95, {})
        elif self._is_pointing(points):
            direction = self._get_pointing_direction(points)
            return HandGesture("pointing", 0.90, {"direction": direction})
        elif self._is_open_palm(points):
            return HandGesture("stop", 0.92, {})
        elif self._is_closed_fist(points):
            return HandGesture("grab", 0.88, {})
        elif self._is_peace_sign(points):
            return HandGesture("peace", 0.85, {})

        return HandGesture("unknown", 0.5, {})

    def _is_thumbs_up(self, points: np.ndarray) -> bool:
        """
        Detect thumbs-up gesture.
        Criteria: Thumb tip above MCP joint, other fingers curled.
        """
        # Landmark indices (see MediaPipe hand landmark diagram)
        thumb_tip = points[4]
        thumb_mcp = points[2]
        index_tip = points[8]
        index_pip = points[6]

        # Thumb extended upward
        thumb_extended = thumb_tip[1] < thumb_mcp[1]  # y decreases upward

        # Other fingers curled (tip below PIP joint)
        fingers_curled = all([
            points[8][1] > points[6][1],   # Index
            points[12][1] > points[10][1],  # Middle
            points[16][1] > points[14][1],  # Ring
            points[20][1] > points[18][1]   # Pinky
        ])

        return thumb_extended and fingers_curled

    def _is_pointing(self, points: np.ndarray) -> bool:
        """
        Detect pointing gesture.
        Criteria: Index finger extended, other fingers curled.
        """
        # Index finger extended
        index_extended = points[8][1] < points[6][1]  # Tip above PIP

        # Other fingers curled
        other_curled = all([
            points[12][1] > points[10][1],  # Middle
            points[16][1] > points[14][1],  # Ring
            points[20][1] > points[18][1]   # Pinky
        ])

        return index_extended and other_curled

    def _is_open_palm(self, points: np.ndarray) -> bool:
        """
        Detect open palm (stop signal).
        Criteria: All fingers extended and spread.
        """
        # All fingertips above their respective MCP joints
        fingers_extended = all([
            points[4][1] < points[2][1],    # Thumb
            points[8][1] < points[5][1],    # Index
            points[12][1] < points[9][1],   # Middle
            points[16][1] < points[13][1],  # Ring
            points[20][1] < points[17][1]   # Pinky
        ])

        return fingers_extended

    def _is_closed_fist(self, points: np.ndarray) -> bool:
        """
        Detect closed fist (grab command).
        Criteria: All fingers curled below MCP joints.
        """
        all_curled = all([
            points[8][1] > points[5][1],    # Index
            points[12][1] > points[9][1],   # Middle
            points[16][1] > points[13][1],  # Ring
            points[20][1] > points[17][1]   # Pinky
        ])

        return all_curled

    def _is_peace_sign(self, points: np.ndarray) -> bool:
        """
        Detect peace sign (index + middle extended).
        """
        index_extended = points[8][1] < points[6][1]
        middle_extended = points[12][1] < points[10][1]
        ring_curled = points[16][1] > points[14][1]
        pinky_curled = points[20][1] > points[18][1]

        return index_extended and middle_extended and ring_curled and pinky_curled

    def _get_pointing_direction(self, points: np.ndarray) -> str:
        """
        Compute pointing direction from index finger orientation.

        Returns:
            Direction string: "left", "right", "up", "down", "forward"
        """
        # Vector from index MCP to tip
        mcp = points[5]
        tip = points[8]
        vector = tip - mcp

        # Analyze primary component
        if abs(vector[0]) > abs(vector[1]):
            return "right" if vector[0] > 0 else "left"
        else:
            return "down" if vector[1] > 0 else "up"

# Example: Real-time gesture recognition
recognizer = GestureRecognizer()
cap = cv2.VideoCapture(0)

while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break

    gesture = recognizer.recognize_gesture(frame)

    if gesture and gesture.confidence > 0.8:
        # Display recognized gesture
        text = f"{gesture.name} ({gesture.confidence:.2f})"
        cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    1, (0, 255, 0), 2)

        # Execute robot command based on gesture
        if gesture.name == "pointing":
            print(f"Robot: Navigate {gesture.parameters['direction']}")
        elif gesture.name == "grab":
            print("Robot: Execute grasp")
        elif gesture.name == "stop":
            print("Robot: Emergency stop")

    cv2.imshow('Gesture Recognition', frame)
    if cv2.waitKey(5) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

Pose Estimation for Human-Robot Interaction

Pose estimation tracks full-body landmarks (33 points including shoulders, elbows, hips, knees) to understand human activity and spatial positioning. For collaborative robotics, this enables:

Proximity Detection: Stop robot arm if human enters workspace
Activity Recognition: Distinguish standing, sitting, walking for context-aware behavior
Gesture Demonstration: Learn manipulation tasks by imitating human arm movements

Full-Body Pose Tracking

mp_pose = mp.solutions.pose

def track_human_pose(video_source=0):
    """
    Track human body pose for safety monitoring and imitation learning.

    Args:
        video_source: Camera index or video file
    """
    pose = mp_pose.Pose(
        static_image_mode=False,
        model_complexity=1,  # 0=lite, 1=full, 2=heavy (higher = more accurate)
        smooth_landmarks=True,  # Temporal smoothing for video
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )

    cap = cv2.VideoCapture(video_source)

    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(frame_rgb)

        if results.pose_landmarks:
            # Draw pose skeleton
            mp_drawing.draw_landmarks(
                frame,
                results.pose_landmarks,
                mp_pose.POSE_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style()
            )

            # Extract key joint positions for analysis
            landmarks = results.pose_landmarks.landmark
            left_wrist = landmarks[mp_pose.PoseLandmark.LEFT_WRIST]
            right_wrist = landmarks[mp_pose.PoseLandmark.RIGHT_WRIST]

            # Example: Detect if hands are raised (y < 0.5)
            hands_raised = (left_wrist.y < 0.5 and right_wrist.y < 0.5)

            if hands_raised:
                cv2.putText(frame, "Hands Raised - Robot Paused", (10, 60),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow('Pose Tracking', frame)
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
    pose.close()

# Run pose tracker
track_human_pose()

Integrating Gestures with ROS 2

Connect gesture recognition to robot control:

import rclpy
from rclpy.node import Node
from std_msgs.msg import String

class GestureControlNode(Node):
    """
    ROS 2 node that publishes robot commands based on hand gestures.
    """

    def __init__(self):
        super().__init__('gesture_control')

        # Publisher for gesture-based commands
        self.cmd_pub = self.create_publisher(String, '/gesture_commands', 10)

        # Timer for periodic gesture recognition
        self.timer = self.create_timer(0.1, self.timer_callback)  # 10 Hz

        self.recognizer = GestureRecognizer()
        self.cap = cv2.VideoCapture(0)

        self.last_gesture = None
        self.gesture_stable_count = 0
        self.STABILITY_THRESHOLD = 5  # Require 5 consecutive frames for confirmation

    def timer_callback(self):
        """Process camera frame and publish gesture commands."""
        success, frame = self.cap.read()
        if not success:
            return

        gesture = self.recognizer.recognize_gesture(frame)

        if gesture and gesture.confidence > 0.85:
            # Require gesture stability to avoid false triggers
            if gesture.name == self.last_gesture:
                self.gesture_stable_count += 1
            else:
                self.gesture_stable_count = 0
                self.last_gesture = gesture.name

            # Publish command if gesture is stable
            if self.gesture_stable_count >= self.STABILITY_THRESHOLD:
                msg = String()
                msg.data = f"{gesture.name}:{gesture.parameters}"
                self.cmd_pub.publish(msg)

                self.get_logger().info(f"Published gesture command: {gesture.name}")

                # Reset to avoid repeated commands
                self.gesture_stable_count = 0

def main():
    rclpy.init()
    node = GestureControlNode()
    rclpy.spin(node)
    node.destroy_node()
    rclpy.shutdown()

Practice Exercise

Build a gesture-controlled navigation system:

Implement pointing direction detection (8 directions: N, NE, E, SE, S, SW, W, NW)
Map gestures to navigation commands:
- Point → Navigate in indicated direction
- Open palm → Stop
- Closed fist → Return to home position
Add gesture confirmation: Require 1-second hold before executing command
Implement safety checks: Stop robot if human enters proximity (use pose estimation)

Next Steps

You've completed multimodal perception (voice, vision, gestures). In Week 13: Capstone Project, you'll integrate all components into an autonomous humanoid system with conversational AI, visual grounding, and adaptive task execution.

Hand Gesture Detection with MediaPipe​

Why MediaPipe for Robotics?​

Installation and Basic Hand Tracking​

Gesture Command Mapping​

Pose Estimation for Human-Robot Interaction​

Full-Body Pose Tracking​

Integrating Gestures with ROS 2​

Practice Exercise​

Next Steps​