# 手势控制项目

那我们现在就按手势控制项目来提供完整实现方案。该项目的核心是：通过摄像头识别手部关键点，对手势（握拳、OK、比心、伸大拇指等）进行分类，将分类结果映射为键盘事件，实现翻页PPT、控制音量、音乐播放等操作。要求：必须包含前端，数据集来自Kaggle。

## 一、整体技术路线

| 模块 | 技术选择 | 说明 |
| --- | --- | --- |
| 手部关键点提取 | MediaPipe Hands | 轻量、实时，提供21个手部关键点坐标 |
| 手势分类 | 轻量级神经网络（MLP）或随机森林 | 基于关键点相对位置或角度特征分类 |
| 后端服务 | Flask | 接收前端图片，返回手势类别，并触发键盘事件 |
| 前端界面 | HTML + JavaScript + 摄像头 | 调用摄像头，每帧发送给后端，显示识别结果 |
| 键盘事件模拟 | pyautogui / keyboard库 | 在本地执行按键操作（音量、PPT翻页等） |
| 数据集 | Kaggle手势数据集（如 LeapGestRecog 或自制补充） | 用于训练分类器 |

## 二、数据集准备（来自Kaggle）

推荐使用 LeapGestRecog 数据集（Kaggle链接：原文链接在提取时丢失，可在 Kaggle 上按数据集名称搜索）。它包含10种手势（包括握拳、OK、比心、大拇指等），共约20,000张灰度图。也可以使用 Hand Gesture Recognition Dataset（原文链接同样在提取时丢失）等。

下载后，我们需要提取每张图片的手部关键点（用MediaPipe预处理），然后保存关键点特征和标签，用于训练分类器。如果不想重新训练，也可以直接用MediaPipe提供的姿势分类示例，但为了满足“小模型”要求，我们依然训练一个简单的MLP。

## 三、项目文件结构

```text
gesture_control/
├── backend/
│   ├── app.py                  # Flask主程序
│   ├── gesture_classifier.py   # 手势分类模型加载与预测
│   ├── keypoint_extractor.py   # MediaPipe提取关键点
│   ├── train_model.py          # 训练分类器（用Kaggle数据）
│   ├── gesture_model.pkl       # 训练好的分类器（如RandomForest或MLP）
│   ├── requirements.txt
│   └── templates/
│       └── index.html          # 前端页面（也可放在frontend/）
├── frontend/                   # 若前后分离，这里为简洁直接用templates
│   └── static/
│       ├── style.css
│       └── script.js
└── data/                       # 存放Kaggle数据集或预处理后的特征
```

## 四、数据集预处理与模型训练

### 4.1 提取关键点特征（keypoint_extractor.py）

```python
import cv2
import mediapipe as mp
import numpy as np
import os

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

def extract_hand_landmarks(image_path):
    """从图片中提取21个关键点的归一化坐标（x, y），共42维"""
    img = cv2.imread(image_path)
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)
    if not results.multi_hand_landmarks:
        return None
    landmarks = results.multi_hand_landmarks[0]
    # 归一化：相对于手腕（0点）的偏移，或直接使用归一化坐标
    features = []
    for lm in landmarks.landmark:
        features.extend([lm.x, lm.y])  # 共42维
    return np.array(features)

def process_dataset(data_folder, label_map):
    """
    遍历data_folder下的子文件夹，每个子文件夹代表一类手势
    提取特征并保存为npy
    """
    X, y = [], []
    for label_name, label_id in label_map.items():
        folder = os.path.join(data_folder, label_name)
        for file in os.listdir(folder):
            if file.endswith(('.png', '.jpg')):
                feat = extract_hand_landmarks(os.path.join(folder, file))
                if feat is not None:
                    X.append(feat)
                    y.append(label_id)
    return np.array(X), np.array(y)
```

### 4.2 训练分类器（train_model.py）

假设我们只选择4种手势：握拳(0)、OK(1)、比心(2)、大拇指(3)。我们可以用Kaggle数据集中的对应类别，或者自己拍摄补充。

训练一个简单的MLP（多层感知机）：

```python
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import joblib

# 假设已经提取好X, y
X, y = ...  # 从上面函数获得
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=500, random_state=42)
clf.fit(X_train, y_train)
print("测试准确率:", clf.score(X_test, y_test))

# 保存模型
joblib.dump(clf, 'gesture_model.pkl')
```

也可以直接用随机森林（更轻量）：

```python
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50, max_depth=10)
```

## 五、后端 Flask 实现

### 5.1 依赖文件（requirements.txt）

```text
flask
flask-cors
mediapipe
opencv-python
numpy
scikit-learn
joblib
pyautogui
keyboard
```

### 5.2 主程序（app.py）

```python
from flask import Flask, request, jsonify, render_template
import cv2
import numpy as np
import base64
from gesture_classifier import predict_gesture
import pyautogui
import keyboard
import threading
import time

app = Flask(__name__)

# 手势到键盘操作的映射
GESTURE_ACTION = {
    0: 'volume_down',   # 握拳 -> 降低音量
    1: 'volume_up',     # OK -> 增加音量
    2: 'next_track',    # 比心 -> 下一首
    3: 'play_pause',    # 大拇指 -> 播放/暂停
}

# 定义执行动作的函数（为避免阻塞，使用线程）
def perform_action(action):
    if action == 'volume_down':
        pyautogui.press('volumedown')
    elif action == 'volume_up':
        pyautogui.press('volumeup')
    elif action == 'next_track':
        pyautogui.press('nexttrack')
    elif action == 'play_pause':
        pyautogui.press('playpause')
    # 也可用于PPT翻页：left/right

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    image_data = data['image'].split(',')[1]  # base64编码
    img_bytes = base64.b64decode(image_data)
    np_arr = np.frombuffer(img_bytes, np.uint8)
    frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)

    # 调用识别函数
    gesture_id, confidence = predict_gesture(frame)

    # 如果置信度足够高，执行动作
    if confidence > 0.7:
        action = GESTURE_ACTION.get(gesture_id)
        if action:
            # 在单独线程中执行，避免阻塞响应
            threading.Thread(target=perform_action, args=(action,)).start()

    # 返回结果给前端用于显示
    gesture_names = ['握拳', 'OK', '比心', '大拇指']
    return jsonify({
        'gesture': gesture_names[gesture_id] if gesture_id is not None else 'unknown',
        'confidence': confidence,
        'action': action if confidence > 0.7 else None
    })

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
```

### 5.3 分类器加载（gesture_classifier.py）

```python
import cv2
import mediapipe as mp
import numpy as np
import joblib

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
model = joblib.load('gesture_model.pkl')

def extract_features_from_frame(frame):
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)
    if not results.multi_hand_landmarks:
        return None
    landmarks = results.multi_hand_landmarks[0]
    features = []
    for lm in landmarks.landmark:
        features.extend([lm.x, lm.y])
    return np.array(features).reshape(1, -1)

def predict_gesture(frame):
    feat = extract_features_from_frame(frame)
    if feat is None:
        return None, 0.0
    proba = model.predict_proba(feat)[0]
    gesture_id = np.argmax(proba)
    confidence = proba[gesture_id]
    return gesture_id, confidence
```

## 六、前端页面（templates/index.html）

直接使用HTML+JS调用摄像头，每帧截图发送给后端。

```html
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>手势控制</title>
    <style>
        body { font-family: Arial, sans-serif; text-align: center; background: #f0f2f5; }
        video { width: 640px; height: 480px; border: 2px solid #333; border-radius: 10px; }
        #result { font-size: 24px; margin-top: 20px; }
        .action { color: green; }
    </style>
</head>
<body>
    <h1>✋ 手势控制系统</h1>
    <video id="video" autoplay playsinline></video>
    <div id="result">识别结果：<span id="gesture">等待...</span></div>
    <div id="action">执行操作：<span id="actionText">无</span></div>
    <script>
        const video = document.getElementById('video');
        const gestureSpan = document.getElementById('gesture');
        const actionSpan = document.getElementById('actionText');

        // 获取摄像头
        navigator.mediaDevices.getUserMedia({ video: true })
            .then(stream => { video.srcObject = stream; })
            .catch(err => alert('无法访问摄像头'));

        // 每隔200ms发送一帧
        setInterval(() => {
            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            const ctx = canvas.getContext('2d');
            ctx.drawImage(video, 0, 0);
            const imageBase64 = canvas.toDataURL('image/jpeg');
            fetch('/predict', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ image: imageBase64 })
            })
            .then(res => res.json())
            .then(data => {
                if (data.gesture !== 'unknown') {
                    gestureSpan.textContent = data.gesture;
                    if (data.action) {
                        actionSpan.textContent = data.action;
                        actionSpan.style.color = 'green';
                    } else {
                        actionSpan.textContent = '无';
                        actionSpan.style.color = 'gray';
                    }
                }
            })
            .catch(err => console.error(err));
        }, 200);
    </script>
</body>
</html>
```

## 七、运行与测试

1. 训练模型：先运行 `train_model.py`（需提前下载Kaggle数据集并解压到 `data/` 目录），生成 `gesture_model.pkl`。
2. 安装依赖：`pip install -r requirements.txt`
3. 启动后端：`python app.py`
4. 浏览器访问：http://localhost:5000 ，允许摄像头权限，即可实时识别。

当识别到手势且置信度>0.7时，会自动模拟键盘按键（音量、播放等）。

## 八、PPT翻页与更多操作

除了音量、音乐控制，你还可以映射为左右方向键（用于PPT翻页）或上下键（滚动）。修改 `GESTURE_ACTION` 字典即可，例如：

```python
GESTURE_ACTION = {
    0: 'left',   # 握拳 -> 左翻页
    1: 'right',  # OK -> 右翻页
    2: 'up',     # 比心 -> 上翻
    3: 'down',   # 大拇指 -> 下翻
}
```

然后执行 `pyautogui.press(action)`。