Skip to content

Visual Wake Word on ESP32

Preface

This example is based on the MLPerf Tiny Visual Wake Word (VWW) benchmark, which uses the COCO 2014 dataset. The classification task is to predict whether at least one person is present in the image (the “person vs. non-person” benchmark). The paper can be found here.

MobileNetV1 (×0.25)

The architecture follows the MLPerf Tiny paper, which differs slightly from the original MobileNetV1 paper.

Layer # Type S P Weight file (dims) Bias file (dims)
01 Conv 3×3 (Cin=3, Cout=8) 2 1 w01.txt (3×3×3×8) b01.txt (8)
02 Depthwise 3×3 (Cin=8, M=1) 1 1 w02.txt (3×3×8) b02.txt (8)
03 Conv 1×1 (Cin=8, Cout=16) 1 0 w03.txt (1×1×8×16) b03.txt (16)
04 Depthwise 3×3 (Cin=16, M=1) 2 1 w04.txt (3×3×16) b04.txt (16)
05 Conv 1×1 (Cin=16, Cout=32) 1 0 w05.txt (1×1×16×32) b05.txt (32)
06 Depthwise 3×3 (Cin=32, M=1) 1 1 w06.txt (3×3×32) b06.txt (32)
07 Conv 1×1 (Cin=32, Cout=32) 1 0 w07.txt (1×1×32×32) b07.txt (32)
08 Depthwise 3×3 (Cin=32, M=1) 2 1 w08.txt (3×3×32) b08.txt (32)
09 Conv 1×1 (Cin=32, Cout=64) 1 0 w09.txt (1×1×32×64) b09.txt (64)
10 Depthwise 3×3 (Cin=64, M=1) 1 1 w10.txt (3×3×64) b10.txt (64)
11 Conv 1×1 (Cin=64, Cout=64) 1 0 w11.txt (1×1×64×64) b11.txt (64)
12 Depthwise 3×3 (Cin=64, M=1) 2 1 w12.txt (3×3×64) b12.txt (64)
13 Conv 1×1 (Cin=64, Cout=128) 1 0 w13.txt (1×1×64×128) b13.txt (128)
14 Depthwise 3×3 (Cin=128, M=1) 1 1 w14.txt (3×3×128) b14.txt (128)
15 Conv 1×1 (Cin=128, Cout=128) 1 0 w15.txt (1×1×128×128) b15.txt (128)
16 Depthwise 3×3 (Cin=128, M=1) 1 1 w16.txt (3×3×128) b16.txt (128)
17 Conv 1×1 (Cin=128, Cout=128) 1 0 w17.txt (1×1×128×128) b17.txt (128)
18 Depthwise 3×3 (Cin=128, M=1) 1 1 w18.txt (3×3×128) b18.txt (128)
19 Conv 1×1 (Cin=128, Cout=128) 1 0 w19.txt (1×1×128×128) b19.txt (128)
20 Depthwise 3×3 (Cin=128, M=1) 1 1 w20.txt (3×3×128) b20.txt (128)
21 Conv 1×1 (Cin=128, Cout=128) 1 0 w21.txt (1×1×128×128) b21.txt (128)
22 Depthwise 3×3 (Cin=128, M=1) 1 1 w22.txt (3×3×128) b22.txt (128)
23 Conv 1×1 (Cin=128, Cout=128) 1 0 w23.txt (1×1×128×128) b23.txt (128)
24 Depthwise 3×3 (Cin=128, M=1) 2 1 w24.txt (3×3×128) b24.txt (128)
25 Conv 1×1 (Cin=128, Cout=256) 1 0 w25.txt (1×1×128×256) b25.txt (256)
26 Depthwise 3×3 (Cin=256, M=1) 1 1 w26.txt (3×3×256) b26.txt (256)
27 Conv 1×1 (Cin=256, Cout=256) 1 0 w27.txt (1×1×256×256) b27.txt (256)
28 FC (in=256, out=2) w28.txt (2×256) b28.txt (2)

Download the image here.

Layer-by-layer implementation

We will perform all operations with variables. Test images are sent via serial communication from a Python program running on a PC. The inference time is about 3.7 seconds per image.

Pool POOL_ID; POOL_ID.M = 1; POOL_ID.T = 1;

uint16_t W = IN_W;
uint16_t V = 0;

ConvMem c00; c00.K=3; c00.P=AUTO; c00.S=2; c00.weight=w01; c00.bias=b01; c00.act=ACT_RELU;
V = noodle_conv_float(B, 3, 8, A, W, c00, POOL_ID, nullptr); W = V;

ConvMem d01; d01.K=3; d01.P=1; d01.S=1; d01.weight=w02; d01.bias=b02; d01.act=ACT_RELU;
V = noodle_dwconv_float(A, 8, B, W, d01, POOL_ID, nullptr); W = V;

ConvMem c02; c02.K=1; c02.P=0; c02.S=1; c02.weight=w03; c02.bias=b03; c02.act=ACT_RELU;
V = noodle_conv_float(B, 8, 16, A, W, c02, POOL_ID, nullptr); W = V;

ConvMem d03; d03.K=3; d03.P=AUTO; d03.S=2; d03.weight=w04; d03.bias=b04; d03.act=ACT_RELU;
V = noodle_dwconv_float(A, 16, B, W, d03, POOL_ID, nullptr); W = V;

ConvMem c04; c04.K=1; c04.P=0; c04.S=1; c04.weight=w05; c04.bias=b05; c04.act=ACT_RELU;
V = noodle_conv_float(B, 16, 32, A, W, c04, POOL_ID, nullptr); W = V;

ConvMem d05; d05.K=3; d05.P=1; d05.S=1; d05.weight=w06; d05.bias=b06; d05.act=ACT_RELU;
V = noodle_dwconv_float(A, 32, B, W, d05, POOL_ID, nullptr); W = V;

ConvMem c06; c06.K=1; c06.P=0; c06.S=1; c06.weight=w07; c06.bias=b07; c06.act=ACT_RELU;
V = noodle_conv_float(B, 32, 32, A, W, c06, POOL_ID, nullptr); W = V;

ConvMem d07; d07.K=3; d07.P=AUTO; d07.S=2; d07.weight=w08; d07.bias=b08; d07.act=ACT_RELU;
V = noodle_dwconv_float(A, 32, B, W, d07, POOL_ID, nullptr); W = V;
//Serial.println(V);

ConvMem c08; c08.K=1; c08.P=0; c08.S=1; c08.weight=w09; c08.bias=b09; c08.act=ACT_RELU;
V = noodle_conv_float(B, 32, 64, A, W, c08, POOL_ID, nullptr); W = V;

ConvMem d09; d09.K=3; d09.P=1; d09.S=1; d09.weight=w10; d09.bias=b10; d09.act=ACT_RELU;
V = noodle_dwconv_float(A, 64, B, W, d09, POOL_ID, nullptr); W = V;

ConvMem c10; c10.K=1; c10.P=0; c10.S=1; c10.weight=w11; c10.bias=b11; c10.act=ACT_RELU;
V = noodle_conv_float(B, 64, 64, A, W, c10, POOL_ID, nullptr); W = V;

ConvMem d11; d11.K=3; d11.P=AUTO; d11.S=2; d11.weight=w12; d11.bias=b12; d11.act=ACT_RELU;
V = noodle_dwconv_float(A, 64, B, W, d11, POOL_ID, nullptr); W = V;

ConvMem c12; c12.K=1; c12.P=0; c12.S=1; c12.weight=w13; c12.bias=b13; c12.act=ACT_RELU;
V = noodle_conv_float(B, 64, 128, A, W, c12, POOL_ID, nullptr); W = V;

ConvMem d13; d13.K=3; d13.P=1; d13.S=1; d13.weight=w14; d13.bias=b14; d13.act=ACT_RELU;
V = noodle_dwconv_float(A, 128, B, W, d13, POOL_ID, nullptr); W = V;

ConvMem c14; c14.K=1; c14.P=0; c14.S=1; c14.weight=w15; c14.bias=b15; c14.act=ACT_RELU;
V = noodle_conv_float(B, 128, 128, A, W, c14, POOL_ID, nullptr); W = V;

ConvMem d15; d15.K=3; d15.P=1; d15.S=1; d15.weight=w16; d15.bias=b16; d15.act=ACT_RELU;
V = noodle_dwconv_float(A, 128, B, W, d15, POOL_ID, nullptr); W = V;

ConvMem c16; c16.K=1; c16.P=0; c16.S=1; c16.weight=w17; c16.bias=b17; c16.act=ACT_RELU;
V = noodle_conv_float(B, 128, 128, A, W, c16, POOL_ID, nullptr); W = V;

ConvMem d17; d17.K=3; d17.P=1; d17.S=1; d17.weight=w18; d17.bias=b18; d17.act=ACT_RELU;
V = noodle_dwconv_float(A, 128, B, W, d17, POOL_ID, nullptr); W = V;

ConvMem c18; c18.K=1; c18.P=0; c18.S=1; c18.weight=w19; c18.bias=b19; c18.act=ACT_RELU;
V = noodle_conv_float(B, 128, 128, A, W, c18, POOL_ID, nullptr); W = V;

ConvMem d19; d19.K=3; d19.P=1; d19.S=1; d19.weight=w20; d19.bias=b20; d19.act=ACT_RELU;
V = noodle_dwconv_float(A, 128, B, W, d19, POOL_ID, nullptr); W = V;

ConvMem c20; c20.K=1; c20.P=0; c20.S=1; c20.weight=w21; c20.bias=b21; c20.act=ACT_RELU;
V = noodle_conv_float(B, 128, 128, A, W, c20, POOL_ID, nullptr); W = V;

ConvMem d21; d21.K=3; d21.P=1; d21.S=1; d21.weight=w22; d21.bias=b22; d21.act=ACT_RELU;
V = noodle_dwconv_float(A, 128, B, W, d21, POOL_ID, nullptr); W = V;

ConvMem c22; c22.K=1; c22.P=0; c22.S=1; c22.weight=w23; c22.bias=b23; c22.act=ACT_RELU;
V = noodle_conv_float(B, 128, 128, A, W, c22, POOL_ID, nullptr); W = V;

ConvMem d23; d23.K=3; d23.P=AUTO; d23.S=2; d23.weight=w24; d23.bias=b24; d23.act=ACT_RELU;
V = noodle_dwconv_float(A, 128, B, W, d23, POOL_ID, nullptr); W = V;

ConvMem c24; c24.K=1; c24.P=0; c24.S=1; c24.weight=w25; c24.bias=b25; c24.act=ACT_RELU;
V = noodle_conv_float(B, 128, 256, A, W, c24, POOL_ID, nullptr); W = V;

ConvMem d25; d25.K=3; d25.P=1; d25.S=1; d25.weight=w26; d25.bias=b26; d25.act=ACT_RELU;
V = noodle_dwconv_float(A, 256, B, W, d25, POOL_ID, nullptr); W = V;
//Serial.println(V);

ConvMem c26; c26.K=1; c26.P=0; c26.S=1; c26.weight=w27; c26.bias=b27; c26.act=ACT_RELU;
V = noodle_conv_float(B, 256, 256, A, W, c26, POOL_ID, nullptr); W = V;

uint16_t C = noodle_gap(A, 256, W);

float out2[2];
FCNMem fcf; fcf.weight = w28; fcf.bias = b28; fcf.act = ACT_SOFTMAX;
(void)noodle_fcn((const float*)A, 256, 2, out2, fcf, nullptr);