stream.wasm : add web-based real-time transcription (#112)

ggerganov · Nov 25, 2022 · 3c390ff · 3c390ff
1 parent be16dfa
commit 3c390ff
Show file tree

Hide file tree

Showing 9 changed files with 679 additions and 9 deletions.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
     add_subdirectory(whisper.wasm)
+    add_subdirectory(stream.wasm)
     add_subdirectory(talk.wasm)
 else()
     add_subdirectory(main)

diff --git a/examples/helpers.js b/examples/helpers.js
@@ -19,6 +19,12 @@ var printTextarea = (function() {
     };
 })();
 
+async function clearCache() {
+    if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
+        indexedDB.deleteDatabase(dbName);
+    }
+}
+
 // fetch a remote file from remote URL using the Fetch API
 async function fetchRemote(url, cbProgress, cbPrint) {
     cbPrint('fetchRemote: downloading with fetch()...');

diff --git a/examples/stream.wasm/CMakeLists.txt b/examples/stream.wasm/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# libstream - executable target built from emscripten.cpp and linked against the whisper library
+#
+
+set(TARGET libstream)
+
+add_executable(${TARGET}
+    emscripten.cpp
+    )
+
+target_link_libraries(${TARGET} PRIVATE
+    whisper
+    )
+
+unset(EXTRA_FLAGS) # start from a clean value; only set below when WHISPER_WASM_SINGLE_FILE is enabled
+
+if (WHISPER_WASM_SINGLE_FILE)
+    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
+    message(STATUS "Embedding WASM inside stream.js")
+
+    add_custom_command(
+        TARGET ${TARGET} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+        ${CMAKE_BINARY_DIR}/bin/libstream.js
+        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
+        ) # after each build, copies bin/libstream.js to stream.wasm/stream.js in the runtime output directory
+endif()
+
+set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
+    --bind \
+    -s USE_PTHREADS=1 \
+    -s PTHREAD_POOL_SIZE=8 \
+    -s INITIAL_MEMORY=1024MB \
+    -s TOTAL_MEMORY=1024MB \
+    -s FORCE_FILESYSTEM=1 \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    ${EXTRA_FLAGS} \
+    ") # NOTE(review): PTHREAD_POOL_SIZE=8 equals N_THREAD in emscripten.cpp - presumably intentional, keep in sync
+
+#
+# stream.wasm - page files (index.html, helpers.js) generated into the runtime output directory
+#
+
+set(TARGET stream.wasm)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY) # index-tmpl.html -> index.html; @ONLY limits substitution to @VAR@ references
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY) # helpers.js comes from the parent examples directory
diff --git a/examples/stream.wasm/README.md b/examples/stream.wasm/README.md
@@ -0,0 +1,20 @@
+# stream.wasm
+
+Real-time transcription in the browser using WebAssembly
+
+Online demo: https://whisper.ggerganov.com/stream/
+
+## Build instructions
+
+```bash
+# build using Emscripten (v3.1.2)
+git clone https://github.com/ggerganov/whisper.cpp
+cd whisper.cpp
+mkdir build-em && cd build-em
+emcmake cmake ..
+make -j
+
+# copy the generated page files to the directory served by your HTTP server
+cp bin/stream.wasm/*       /path/to/html/
+cp bin/libstream.worker.js /path/to/html/
+```
diff --git a/examples/stream.wasm/emscripten.cpp b/examples/stream.wasm/emscripten.cpp
@@ -0,0 +1,213 @@
+#include "ggml.h"
+#include "whisper.h"
+
+#include <emscripten.h>
+#include <emscripten/bind.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+constexpr int N_THREAD = 8; // upper bound on the whisper thread count requested in stream_main()
+
+std::vector<struct whisper_context *> g_contexts(4, nullptr); // 4 context slots; nullptr marks a free slot
+
+std::mutex g_mutex; // held around every access to g_status, g_status_forced, g_transcribed and g_pcmf32
+std::thread g_worker; // thread running stream_main(), started by the "init" binding
+
+std::atomic<bool> g_running(false); // stream_main() loops while this is true; the "free" binding clears it
+
+std::string g_status        = ""; // status text written through stream_set_status()
+std::string g_status_forced = ""; // when non-empty, "get_status" returns this instead of g_status
+std::string g_transcribed   = ""; // text from the latest whisper run, handed out by "get_transcribed"
+
+std::vector<float> g_pcmf32; // samples written by "set_audio" and consumed (then cleared) by stream_main()
+
+// Replace the status string stored in g_status. g_mutex is held for the
+// duration of the assignment.
+void stream_set_status(const std::string & status) {
+    const std::lock_guard<std::mutex> guard(g_mutex);
+
+    g_status.assign(status);
+}
+
+// Worker-thread entry point.
+//
+// Repeatedly takes the audio currently stored in g_pcmf32, runs whisper_full()
+// on (at most) the last 5 seconds of it and stores the recognized text in
+// g_transcribed. The loop ends when g_running becomes false or whisper_full()
+// fails; the context in g_contexts[index] is then freed and the slot reset to
+// nullptr.
+//
+// index - 0-based slot in g_contexts that holds an initialized whisper context
+void stream_main(size_t index) {
+    // validate the slot up front, before the context is used anywhere below
+    if (index >= g_contexts.size() || g_contexts[index] == nullptr) {
+        printf("stream: invalid context index %d\n", (int) index);
+        return;
+    }
+
+    stream_set_status("loading data ...");
+
+    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
+
+    // hardware_concurrency() may return 0 when the value cannot be determined,
+    // so make sure at least one thread is requested
+    wparams.n_threads        = std::max(1, std::min(N_THREAD, (int) std::thread::hardware_concurrency()));
+    wparams.offset_ms        = 0;
+    wparams.translate        = false;
+    wparams.no_context       = true;
+    wparams.single_segment   = true;
+    wparams.print_realtime   = false;
+    wparams.print_progress   = false;
+    wparams.print_timestamps = true;
+    wparams.print_special    = false;
+
+    wparams.max_tokens       = 32;
+    wparams.audio_ctx        = 768; // partial encoder context for better performance
+
+    wparams.language         = "en";
+
+    // report the thread count that is actually used, not the compile-time maximum
+    printf("stream: using %d threads\n", wparams.n_threads);
+
+    // local copy of the audio window, processed without holding g_mutex
+    std::vector<float> pcmf32;
+
+    // whisper context
+    struct whisper_context * const ctx = g_contexts[index];
+
+    // 5 seconds interval
+    const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
+
+    while (g_running) {
+        stream_set_status("waiting for audio ...");
+
+        {
+            std::unique_lock<std::mutex> lock(g_mutex);
+
+            // too little audio so far - release the lock and poll again shortly
+            if (g_pcmf32.size() < 1024) {
+                lock.unlock();
+
+                std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+                continue;
+            }
+
+            // keep only the most recent window and mark the shared buffer as consumed
+            const int64_t n_take = std::min((int64_t) g_pcmf32.size(), window_samples);
+
+            pcmf32.assign(g_pcmf32.end() - n_take, g_pcmf32.end());
+            g_pcmf32.clear();
+        }
+
+        {
+            const auto t_start = std::chrono::high_resolution_clock::now();
+
+            stream_set_status("running whisper ...");
+
+            const int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
+            if (ret != 0) {
+                printf("whisper_full() failed: %d\n", ret);
+                break;
+            }
+
+            const auto t_end = std::chrono::high_resolution_clock::now();
+
+            printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
+        }
+
+        {
+            std::string text_heard;
+
+            // only the last segment is used; n_segments can be 0, in which case
+            // there is no segment to read and the published text stays empty
+            const int n_segments = whisper_full_n_segments(ctx);
+            if (n_segments > 0) {
+                const char * text = whisper_full_get_segment_text(ctx, n_segments - 1);
+
+                printf("transcribed: %s\n", text);
+
+                text_heard += text;
+            }
+
+            {
+                std::lock_guard<std::mutex> lock(g_mutex);
+                g_transcribed = text_heard;
+            }
+        }
+    }
+
+    // release the context and mark the slot as free for a later "init"
+    whisper_free(ctx);
+    g_contexts[index] = nullptr;
+}
+
+// JavaScript-facing API of the module (registered through embind).
+EMSCRIPTEN_BINDINGS(stream) {
+    // init(path_model): load the model into the first free context slot and start
+    // the worker thread. Returns the 1-based slot number, or 0 when no slot is
+    // free or the model could not be loaded.
+    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
+        for (size_t i = 0; i < g_contexts.size(); ++i) {
+            if (g_contexts[i] != nullptr) {
+                continue;
+            }
+
+            g_contexts[i] = whisper_init(path_model.c_str());
+            if (g_contexts[i] == nullptr) {
+                return (size_t) 0;
+            }
+
+            // Stop and join a previous worker BEFORE re-arming g_running. Setting
+            // g_running to true first would keep the old worker inside its loop
+            // and make join() block forever.
+            if (g_worker.joinable()) {
+                g_running = false;
+                g_worker.join();
+            }
+
+            g_running = true;
+            g_worker = std::thread([i]() {
+                stream_main(i);
+            });
+
+            return i + 1;
+        }
+
+        return (size_t) 0;
+    }));
+
+    // free(index): ask the worker to stop. The argument is not used; the worker
+    // releases its own context when it leaves its loop.
+    emscripten::function("free", emscripten::optional_override([](size_t /*index*/) {
+        g_running = false;
+    }));
+
+    // set_audio(index, audio): replace the pending audio with the contents of the
+    // typed array `audio`. `index` is the 1-based value returned by init().
+    // Returns 0 on success, -1 for an out-of-range index, -2 for an unused slot.
+    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
+        // convert to a 0-based slot; an index of 0 wraps around and is rejected below
+        --index;
+
+        if (index >= g_contexts.size()) {
+            return -1;
+        }
+
+        if (g_contexts[index] == nullptr) {
+            return -2;
+        }
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            const int n = audio["length"].as<int>();
+
+            emscripten::val heap = emscripten::val::module_property("HEAPU8");
+            emscripten::val memory = heap["buffer"];
+
+            g_pcmf32.resize(n);
+
+            // build a view of the same type as `audio` over g_pcmf32's storage in the
+            // wasm heap and let JavaScript copy the samples into it
+            // NOTE(review): assumes `audio` is a Float32Array - confirm against the caller
+            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
+            memoryView.call<void>("set", audio);
+        }
+
+        return 0;
+    }));
+
+    // get_transcribed(): return the latest transcription and reset it, so the same
+    // text is not returned twice.
+    emscripten::function("get_transcribed", emscripten::optional_override([]() {
+        std::string transcribed;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            // swapping with an empty string guarantees g_transcribed is left empty
+            transcribed.swap(g_transcribed);
+        }
+
+        return transcribed;
+    }));
+
+    // get_status(): return the forced status when one is set, otherwise the
+    // worker's status.
+    emscripten::function("get_status", emscripten::optional_override([]() {
+        std::string status;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            status = g_status_forced.empty() ? g_status : g_status_forced;
+        }
+
+        return status;
+    }));
+
+    // set_status(status): set the forced status; an empty string removes the override.
+    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            g_status_forced = status;
+        }
+    }));
+}