Skip to content

Commit

Permalink
stream.wasm : add web-based real-time transcription (#112)
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Nov 25, 2022
1 parent be16dfa commit 3c390ff
Show file tree
Hide file tree
Showing 9 changed files with 679 additions and 9 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
add_subdirectory(whisper.wasm)
add_subdirectory(stream.wasm)
add_subdirectory(talk.wasm)
else()
add_subdirectory(main)
Expand Down
6 changes: 6 additions & 0 deletions examples/helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ var printTextarea = (function() {
};
})();

// Ask the user for confirmation and, only if it is given, delete the
// IndexedDB database named by `dbName` (declared outside this function).
// Per the prompt text, the models have to be downloaded again afterwards.
async function clearCache() {
    const confirmed = confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.');
    if (!confirmed) {
        return;
    }

    indexedDB.deleteDatabase(dbName);
}

// fetch a remote file from remote URL using the Fetch API
async function fetchRemote(url, cbProgress, cbPrint) {
cbPrint('fetchRemote: downloading with fetch()...');
Expand Down
47 changes: 47 additions & 0 deletions examples/stream.wasm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# libstream
#
# Builds emscripten.cpp, linked against the `whisper` target, with the
# Emscripten link flags set further below.
#

set(TARGET libstream)

add_executable(${TARGET}
emscripten.cpp
)

target_link_libraries(${TARGET} PRIVATE
whisper
)

# Start from a clean value so nothing set earlier leaks into LINK_FLAGS.
unset(EXTRA_FLAGS)

# Optional single-file mode: adds -s SINGLE_FILE=1 to the link flags and, after
# the build, copies bin/libstream.js next to the generated page as stream.js.
# Note that this copy step only exists inside this branch.
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside stream.js")

add_custom_command(
TARGET ${TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libstream.js
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js
)
endif()

# Emscripten link flags (the whole value is one quoted string, so whitespace
# and order inside it are part of the value):
#   --bind                     embind support, used by EMSCRIPTEN_BINDINGS in emscripten.cpp
#   USE_PTHREADS / POOL_SIZE   pthread support with a pool of 8 workers
#                              (emscripten.cpp uses N_THREAD = 8)
#   INITIAL_MEMORY             1024MB heap
#   TOTAL_MEMORY               NOTE(review): appears to be the legacy name of
#                              INITIAL_MEMORY - confirm whether both are needed
#   FORCE_FILESYSTEM           presumably needed so the model file can be placed
#                              in the virtual FS from JavaScript - TODO confirm
#   EXPORTED_RUNTIME_METHODS   runtime helpers made available on the Module object
#   ${EXTRA_FLAGS}             "-s SINGLE_FILE=1" when WHISPER_WASM_SINGLE_FILE is set
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")

#
# stream.wasm
#
# Static page assets: index-tmpl.html and the shared helpers.js are written to
# <runtime output dir>/stream.wasm/. @ONLY restricts substitution to @VAR@
# references, leaving ${...} sequences in the files untouched.
#

set(TARGET stream.wasm)

configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
20 changes: 20 additions & 0 deletions examples/stream.wasm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# stream.wasm

Real-time transcription in the browser using WebAssembly

Online demo: https://whisper.ggerganov.com/stream/

## Build instructions

```bash
# build using Emscripten (v3.1.2)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j

# copy the produced page to your HTTP path
cp bin/stream.wasm/* /path/to/html/
cp bin/libstream.worker.js /path/to/html/
```
213 changes: 213 additions & 0 deletions examples/stream.wasm/emscripten.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#include "ggml.h"
#include "whisper.h"

#include <emscripten.h>
#include <emscripten/bind.h>

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// upper bound on the number of threads requested from whisper
constexpr int N_THREAD{8};

// whisper context slots; a nullptr entry marks a free slot
std::vector<struct whisper_context *> g_contexts(4, nullptr);

// g_mutex guards the status strings, g_transcribed and g_pcmf32
std::mutex  g_mutex;
std::thread g_worker;

// the worker loop keeps going for as long as this flag is set
std::atomic<bool> g_running{false};

// g_status_forced, when non-empty, is reported instead of g_status
std::string g_status;
std::string g_status_forced;
std::string g_transcribed;

// audio samples handed over from JavaScript, waiting to be processed
std::vector<float> g_pcmf32;

// Replace the status message in g_status with `status`.
// g_mutex is held for the duration of the assignment.
void stream_set_status(const std::string & status) {
    const std::lock_guard<std::mutex> guard(g_mutex);
    g_status.assign(status);
}

// Worker-thread entry point.
//
// Repeatedly takes the audio currently stored in g_pcmf32 (at most the last
// 5 seconds of it), runs whisper_full() on it and stores the text of the last
// resulting segment in g_transcribed.
//
//   index - slot in g_contexts holding the whisper context to use
//
// The loop ends when g_running becomes false or whisper_full() fails. Before
// returning, the context in the slot is freed and the slot is reset to nullptr.
void stream_main(size_t index) {
    stream_set_status("loading data ...");

    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

    // hardware_concurrency() may return 0 when the value cannot be determined,
    // so clamp to at least one thread
    wparams.n_threads        = std::max(1, std::min(N_THREAD, (int) std::thread::hardware_concurrency()));
    wparams.offset_ms        = 0;
    wparams.translate        = false;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.print_realtime   = false;
    wparams.print_progress   = false;
    wparams.print_timestamps = true;
    wparams.print_special    = false;

    wparams.max_tokens       = 32;
    wparams.audio_ctx        = 768; // partial encoder context for better performance

    wparams.language         = "en";

    // report the thread count that is actually used, not the compile-time cap
    printf("stream: using %d threads\n", wparams.n_threads);

    std::vector<float> pcmf32;

    // whisper context
    auto & ctx = g_contexts[index];

    // 5 seconds interval
    const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;

    while (g_running) {
        stream_set_status("waiting for audio ...");

        {
            std::unique_lock<std::mutex> lock(g_mutex);

            if (g_pcmf32.size() < 1024) {
                // not enough audio yet - release the lock before sleeping so
                // the producer can deliver more samples
                lock.unlock();

                std::this_thread::sleep_for(std::chrono::milliseconds(10));

                continue;
            }

            // keep only the most recent window and consume the shared buffer
            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
            g_pcmf32.clear();
        }

        {
            const auto t_start = std::chrono::high_resolution_clock::now();

            stream_set_status("running whisper ...");

            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
            if (ret != 0) {
                printf("whisper_full() failed: %d\n", ret);
                break;
            }

            const auto t_end = std::chrono::high_resolution_clock::now();

            printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
        }

        {
            std::string text_heard;

            {
                // only the last segment is reported; when whisper produced no
                // segment at all there is nothing to read (index -1 is invalid)
                const int n_segments = whisper_full_n_segments(ctx);
                if (n_segments > 0) {
                    const char * text = whisper_full_get_segment_text(ctx, n_segments - 1);

                    printf("transcribed: %s\n", text);

                    text_heard += text;
                }
            }

            {
                std::lock_guard<std::mutex> lock(g_mutex);
                g_transcribed = text_heard;
            }
        }
    }

    // release the context so that the slot can be reused
    if (index < g_contexts.size()) {
        whisper_free(g_contexts[index]);
        g_contexts[index] = nullptr;
    }
}

// JavaScript-facing API, exported through embind.
EMSCRIPTEN_BINDINGS(stream) {
    // init(path_model)
    //
    // Loads the model at `path_model` into the first free context slot and
    // starts the worker thread on it.
    // Returns the slot index + 1, or 0 when no slot is free or loading failed.
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    // There is a single worker thread. A previous worker has to
                    // be told to stop *before* it is joined, and g_running may
                    // only be raised again afterwards: raising it first lets a
                    // worker that has not left its loop yet keep running, and
                    // join() would then never return.
                    if (g_worker.joinable()) {
                        g_running = false;
                        g_worker.join();
                    }
                    g_running = true;
                    g_worker = std::thread([i]() {
                        stream_main(i);
                    });

                    return i + 1;
                } else {
                    return (size_t) 0;
                }
            }
        }

        return (size_t) 0;
    }));

    // free(index)
    //
    // Asks the worker to stop by clearing g_running. `index` is not used here;
    // the worker itself frees its context once it observes the flag.
    emscripten::function("free", emscripten::optional_override([](size_t index) {
        if (g_running) {
            g_running = false;
        }
    }));

    // set_audio(index, audio)
    //
    // Replaces the contents of g_pcmf32 with the samples in `audio`.
    // `index` is the 1-based value returned by init().
    // Returns 0 on success, -1 for an out-of-range index, -2 when the slot
    // holds no context.
    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
        // convert to a 0-based slot; an index of 0 wraps around and is
        // rejected by the range check below
        --index;

        if (index >= g_contexts.size()) {
            return -1;
        }

        if (g_contexts[index] == nullptr) {
            return -2;
        }

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            const int n = audio["length"].as<int>();

            emscripten::val heap = emscripten::val::module_property("HEAPU8");
            emscripten::val memory = heap["buffer"];

            g_pcmf32.resize(n);

            // Build a typed array of the same type as `audio` directly over the
            // storage of g_pcmf32 inside the wasm heap and copy into it.
            // NOTE(review): assumes `audio` is a Float32Array - confirm at the call site
            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
            memoryView.call<void>("set", audio);
        }

        return 0;
    }));

    // get_transcribed()
    //
    // Returns the latest transcription and leaves g_transcribed empty, so the
    // same text is not returned twice.
    emscripten::function("get_transcribed", emscripten::optional_override([]() {
        std::string transcribed;

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            // swap with the empty local: unlike a move, this guarantees that
            // g_transcribed is empty afterwards
            transcribed.swap(g_transcribed);
        }

        return transcribed;
    }));

    // get_status()
    //
    // Returns g_status_forced when it is non-empty, otherwise g_status.
    emscripten::function("get_status", emscripten::optional_override([]() {
        std::string status;

        {
            std::lock_guard<std::mutex> lock(g_mutex);
            status = g_status_forced.empty() ? g_status : g_status_forced;
        }

        return status;
    }));

    // set_status(status)
    //
    // Sets g_status_forced; passing an empty string makes get_status() report
    // g_status again.
    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
        {
            std::lock_guard<std::mutex> lock(g_mutex);
            g_status_forced = status;
        }
    }));
}
Loading

0 comments on commit 3c390ff

Please sign in to comment.