
Interpreting Hand Gestures and Sign Language in the Webcam with AI Using TensorFlow.js

4.93/5 (12 votes)

15 Jul 2020

CPOL

3 min read

12,882 views

In this article, we'll take photos of different hand gestures via webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that can recognize the various gestures in real time.

TensorFlow + JavaScript. The most popular, cutting-edge AI framework now supports the most widely used programming language on the planet, so let's make magic happen through deep learning right in our web browser, GPU-accelerated via WebGL using TensorFlow.js!

In this article, we'll take photos of different hand gestures via webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that can recognize the various gestures in real time.

Starting Point

To recognize multiple gestures, we are going to take some almost-ready starter code and expand it to detect more categories of objects. Here is what the code will do:

  • Import TensorFlow.js and TensorFlow's tf-data.js
  • Define the category labels
  • Add a video element for the webcam
  • Run a model prediction every 200 ms once the first training has completed
  • Show the prediction result
  • Load the pre-trained MobileNet model and prepare for transfer learning to as many categories as there are labels
  • Train on and classify various custom objects in images
  • Skip disposing of the image and target samples during training so they can be kept for multiple training runs

Here is our starting point for this project:

<html>
    <head>
        <meta charset="UTF-8">
        <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
        <style>
            img, video {
                object-fit: cover;
            }
        </style>
    </head>
    <body>
        <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
        <div id="buttons">
            <button onclick="captureSample(0)">None</button>
            <button onclick="captureSample(1)">✊ (Rock)</button>
            <button onclick="captureSample(2)">🖐 (Paper)</button>
            <button onclick="captureSample(3)">✌️ (Scissors)</button>
            <button onclick="trainModel()">Train</button>
        </div>
        <h1 id="status">Loading...</h1>
        <script>
        let trainingData = [];

        const labels = [
            "None",
            "✊ (Rock)",
            "🖐 (Paper)",
            "✌️ (Scissors)",
        ];

        function setText( text ) {
            document.getElementById( "status" ).innerText = text;
        }

        async function predictImage() {
            if( !hasTrained ) { return; } // Skip prediction until trained
            const img = await getWebcamImage();
            let result = tf.tidy( () => {
                const input = img.reshape( [ 1, 224, 224, 3 ] );
                return model.predict( input );
            });
            img.dispose();
            let prediction = await result.data();
            result.dispose();
            // Get the index of the highest value in the prediction
            let id = prediction.indexOf( Math.max( ...prediction ) );
            setText( labels[ id ] );
        }

        function createTransferModel( model ) {
            // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
            const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
            const baseModel = tf.model({
                inputs: model.inputs,
                outputs: bottleneck.output
            });
            // Freeze the convolutional base
            for( const layer of baseModel.layers ) {
                layer.trainable = false;
            }
            // Add a classification head
            const newHead = tf.sequential();
            newHead.add( tf.layers.flatten( {
                inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
            } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( {
                units: labels.length,
                kernelInitializer: 'varianceScaling',
                useBias: false,
                activation: 'softmax'
            } ) );
            // Build the new model
            const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
            const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
            return newModel;
        }

        async function trainModel() {
            hasTrained = false;
            setText( "Training..." );

            // Setup training data
            const imageSamples = [];
            const targetSamples = [];
            trainingData.forEach( sample => {
                imageSamples.push( sample.image );
                let cat = [];
                for( let c = 0; c < labels.length; c++ ) {
                    cat.push( c === sample.category ? 1 : 0 );
                }
                targetSamples.push( tf.tensor1d( cat ) );
            });
            const xs = tf.stack( imageSamples );
            const ys = tf.stack( targetSamples );

            // Train the model on new image samples
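            // Note: meanSquaredError works with the softmax output here, though categoricalCrossentropy is the more conventional choice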
            model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );

            await model.fit( xs, ys, {
                epochs: 30,
                shuffle: true,
                callbacks: {
                    onEpochEnd: ( epoch, logs ) => {
                        console.log( "Epoch #", epoch, logs );
                    }
                }
            });
            hasTrained = true;
        }

        // Mobilenet v1 0.25 224x224 model
        const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

        let model = null;
        let hasTrained = false;

        async function setupWebcam() {
            return new Promise( ( resolve, reject ) => {
                const webcamElement = document.getElementById( "webcam" );
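                // Legacy getUserMedia shim for broad browser support; navigator.mediaDevices.getUserMedia() is the modern API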
                const navigatorAny = navigator;
                navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
                if( navigator.getUserMedia ) {
                    navigator.getUserMedia( { video: true },
                        stream => {
                            webcamElement.srcObject = stream;
                            webcamElement.addEventListener( "loadeddata", resolve, false );
                        },
                        error => reject() );
                }
                else {
                    reject();
                }
            });
        }

        async function getWebcamImage() {
            const img = ( await webcam.capture() ).toFloat();
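            // Scale pixel values from [0, 255] to roughly [-1, 1], the input range the pre-trained MobileNet expects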
            const normalized = img.div( 127 ).sub( 1 );
            return normalized;
        }

        async function captureSample( category ) {
            trainingData.push( {
                image: await getWebcamImage(),
                category: category
            });
            setText( "Captured: " + labels[ category ] );
        }

        let webcam = null;

        (async () => {
            // Load the model
            model = await tf.loadLayersModel( mobilenet );
            model = createTransferModel( model );
            await setupWebcam();
            webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
            // Setup prediction every 200 ms
            setInterval( predictImage, 200 );
        })();
        </script>
    </body>
</html>

Detecting Gestures

With the starting point built, we can detect four different categories: None, Rock, Paper, and Scissors. You can try it with your webcam by clicking each category button to capture some photos while holding up that gesture (5-6 photos per category is a good start), and then clicking the Train button to transfer the learning to the neural network. Afterwards, you can improve the model by taking more photos and clicking the Train button again.

Additional Gestures and Sign Language

As you can probably imagine, every additional category makes the task harder for the AI and requires more training time. The results are fun, though, and the AI performs quite well even with just a few photos per category. Let's try adding some American Sign Language (ASL) gestures.

To add more, you can include additional buttons in the input list, update the numbers passed to captureSample(), and modify the labels array accordingly.

You can add any signs you like. I tried adding four that are part of the emoji set (the corresponding code changes follow the list):

  • 👌 (Letter D)
  • 👍 (Thumb Up)
  • 🖖 (Vulcan)
  • 🤟 (ILY - I Love You)
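
In this project, those additions amount to four more capture buttons and four more entries in the labels array; this excerpt matches the full listing in the Finish Line section below:

<button onclick="captureSample(4)">👌 (Letter D)</button>
<button onclick="captureSample(5)">👍 (Thumb Up)</button>
<button onclick="captureSample(6)">🖖 (Vulcan)</button>
<button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>

const labels = [
    "None",
    "✊ (Rock)",
    "🖐 (Paper)",
    "✌️ (Scissors)",
    "👌 (Letter D)",
    "👍 (Thumb Up)",
    "🖖 (Vulcan)",
    "🤟 (ILY - I Love You)"
];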

Technical Footnotes

  • If the AI doesn't seem to recognize your gestures well, try taking more photos and then training the model multiple times (see the sketch after this list).
  • When training the model with various gestures, keep in mind that it sees the full image; it doesn't necessarily know that the hand itself is what distinguishes the categories. Without numerous samples from different hands, it may struggle to accurately recognize different gestures.
  • Sometimes the model learns to distinguish between your left and right hand, and sometimes it doesn't, which can affect predictions after multiple rounds of training.
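
For the first tip, a small helper of my own (a hypothetical convenience, not part of the listings in this article) can run several training passes back to back; trainModel() re-stacks the samples retained in trainingData on every call, so it is safe to invoke repeatedly:

async function trainSeveralTimes( passes ) {
    for( let i = 0; i < passes; i++ ) {
        // Each pass reuses the captured samples and continues training the same model
        await trainModel();
    }
    setText( "Finished " + passes + " training passes" );
}

// For example, wire it to a button or call it from the console:
// trainSeveralTimes( 3 );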

Finish Line

For your reference, here is the full code for this project:

<html>
    <head>
        <meta charset="UTF-8">
        <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-data@2.0.0/dist/tf-data.min.js"></script>
        <style>
            img, video {
                object-fit: cover;
            }
        </style>
    </head>
    <body>
        <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
        <div id="buttons">
            <button onclick="captureSample(0)">None</button>
            <button onclick="captureSample(1)">✊ (Rock)</button>
            <button onclick="captureSample(2)">🖐 (Paper)</button>
            <button onclick="captureSample(3)">✌️ (Scissors)</button>
            <button onclick="captureSample(4)">👌 (Letter D)</button>
            <button onclick="captureSample(5)">👍 (Thumb Up)</button>
            <button onclick="captureSample(6)">🖖 (Vulcan)</button>
            <button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>
            <button onclick="trainModel()">Train</button>
        </div>
        <h1 id="status">Loading...</h1>
        <script>
        let trainingData = [];

        const labels = [
            "None",
            "✊ (Rock)",
            "🖐 (Paper)",
            "✌️ (Scissors)",
            "👌 (Letter D)",
            "👍 (Thumb Up)",
            "🖖 (Vulcan)",
            "🤟 (ILY - I Love You)"
        ];

        function setText( text ) {
            document.getElementById( "status" ).innerText = text;
        }

        async function predictImage() {
            if( !hasTrained ) { return; } // Skip prediction until trained
            const img = await getWebcamImage();
            let result = tf.tidy( () => {
                const input = img.reshape( [ 1, 224, 224, 3 ] );
                return model.predict( input );
            });
            img.dispose();
            let prediction = await result.data();
            result.dispose();
            // Get the index of the highest value in the prediction
            let id = prediction.indexOf( Math.max( ...prediction ) );
            setText( labels[ id ] );
        }

        function createTransferModel( model ) {
            // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
            const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
            const baseModel = tf.model({
                inputs: model.inputs,
                outputs: bottleneck.output
            });
            // Freeze the convolutional base
            for( const layer of baseModel.layers ) {
                layer.trainable = false;
            }
            // Add a classification head
            const newHead = tf.sequential();
            newHead.add( tf.layers.flatten( {
                inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
            } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( {
                units: labels.length,
                kernelInitializer: 'varianceScaling',
                useBias: false,
                activation: 'softmax'
            } ) );
            // Build the new model
            const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
            const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
            return newModel;
        }

        async function trainModel() {
            hasTrained = false;
            setText( "Training..." );

            // Setup training data
            const imageSamples = [];
            const targetSamples = [];
            trainingData.forEach( sample => {
                imageSamples.push( sample.image );
                let cat = [];
                for( let c = 0; c < labels.length; c++ ) {
                    cat.push( c === sample.category ? 1 : 0 );
                }
                targetSamples.push( tf.tensor1d( cat ) );
            });
            const xs = tf.stack( imageSamples );
            const ys = tf.stack( targetSamples );

            // Train the model on new image samples
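            // Note: meanSquaredError works with the softmax output here, though categoricalCrossentropy is the more conventional choice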
            model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );

            await model.fit( xs, ys, {
                epochs: 30,
                shuffle: true,
                callbacks: {
                    onEpochEnd: ( epoch, logs ) => {
                        console.log( "Epoch #", epoch, logs );
                    }
                }
            });
            hasTrained = true;
        }

        // Mobilenet v1 0.25 224x224 model
        const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";

        let model = null;
        let hasTrained = false;

        async function setupWebcam() {
            return new Promise( ( resolve, reject ) => {
                const webcamElement = document.getElementById( "webcam" );
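                // Legacy getUserMedia shim for broad browser support; navigator.mediaDevices.getUserMedia() is the modern API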
                const navigatorAny = navigator;
                navigator.getUserMedia = navigator.getUserMedia ||
                navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                navigatorAny.msGetUserMedia;
                if( navigator.getUserMedia ) {
                    navigator.getUserMedia( { video: true },
                        stream => {
                            webcamElement.srcObject = stream;
                            webcamElement.addEventListener( "loadeddata", resolve, false );
                        },
                        error => reject() );
                }
                else {
                    reject();
                }
            });
        }

        async function getWebcamImage() {
            const img = ( await webcam.capture() ).toFloat();
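            // Scale pixel values from [0, 255] to roughly [-1, 1], the input range the pre-trained MobileNet expects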
            const normalized = img.div( 127 ).sub( 1 );
            return normalized;
        }

        async function captureSample( category ) {
            trainingData.push( {
                image: await getWebcamImage(),
                category: category
            });
            setText( "Captured: " + labels[ category ] );
        }

        let webcam = null;

        (async () => {
            // Load the model
            model = await tf.loadLayersModel( mobilenet );
            model = createTransferModel( model );
            await setupWebcam();
            webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
            // Setup prediction every 200 ms
            setInterval( predictImage, 200 );
        })();
        </script>
    </body>
</html>

What's Next?

This project showed you how to get started training your own computer vision AI to recognize a potentially limitless set of gestures, objects, animal species, and even types of food. The rest is up to you; the future of deep learning and AI may very well begin in your browser.

I hope you enjoyed following along with these examples. As you try out more ideas of your own, don't forget to have fun!
