-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
stream.wasm : add web-based real-time transcription (#112)
- Loading branch information
Showing
9 changed files
with
679 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# | ||
# libstream | ||
# | ||
|
||
set(TARGET libstream) | ||
|
||
add_executable(${TARGET} | ||
emscripten.cpp | ||
) | ||
|
||
target_link_libraries(${TARGET} PRIVATE | ||
whisper | ||
) | ||
|
||
unset(EXTRA_FLAGS) | ||
|
||
if (WHISPER_WASM_SINGLE_FILE) | ||
set(EXTRA_FLAGS "-s SINGLE_FILE=1") | ||
message(STATUS "Embedding WASM inside stream.js") | ||
|
||
add_custom_command( | ||
TARGET ${TARGET} POST_BUILD | ||
COMMAND ${CMAKE_COMMAND} -E copy | ||
${CMAKE_BINARY_DIR}/bin/libstream.js | ||
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/stream.wasm/stream.js | ||
) | ||
endif() | ||
|
||
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ | ||
--bind \ | ||
-s USE_PTHREADS=1 \ | ||
-s PTHREAD_POOL_SIZE=8 \ | ||
-s INITIAL_MEMORY=1024MB \ | ||
-s TOTAL_MEMORY=1024MB \ | ||
-s FORCE_FILESYSTEM=1 \ | ||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ | ||
${EXTRA_FLAGS} \ | ||
") | ||
|
||
# | ||
# stream.wasm | ||
# | ||
|
||
set(TARGET stream.wasm) | ||
|
||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY) | ||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# stream.wasm | ||
|
||
Real-time transcription in the browser using WebAssembly | ||
|
||
Online demo: https://whisper.ggerganov.com/stream/ | ||
|
||
## Build instructions | ||
|
||
```bash | ||
# build using Emscripten (v3.1.2) | ||
git clone https://github.com/ggerganov/whisper.cpp | ||
cd whisper.cpp | ||
mkdir build-em && cd build-em | ||
emcmake cmake .. | ||
make -j | ||
|
||
# copy the produced page to your HTTP path | ||
cp bin/stream.wasm/* /path/to/html/ | ||
cp bin/libstream.worker.js /path/to/html/ | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
#include "ggml.h" | ||
#include "whisper.h" | ||
|
||
#include <emscripten.h> | ||
#include <emscripten/bind.h> | ||
|
||
#include <atomic> | ||
#include <cmath> | ||
#include <mutex> | ||
#include <string> | ||
#include <thread> | ||
#include <vector> | ||
|
||
constexpr int N_THREAD = 8; | ||
|
||
std::vector<struct whisper_context *> g_contexts(4, nullptr); | ||
|
||
std::mutex g_mutex; | ||
std::thread g_worker; | ||
|
||
std::atomic<bool> g_running(false); | ||
|
||
std::string g_status = ""; | ||
std::string g_status_forced = ""; | ||
std::string g_transcribed = ""; | ||
|
||
std::vector<float> g_pcmf32; | ||
|
||
void stream_set_status(const std::string & status) { | ||
std::lock_guard<std::mutex> lock(g_mutex); | ||
g_status = status; | ||
} | ||
|
||
void stream_main(size_t index) { | ||
stream_set_status("loading data ..."); | ||
|
||
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY); | ||
|
||
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); | ||
wparams.offset_ms = 0; | ||
wparams.translate = false; | ||
wparams.no_context = true; | ||
wparams.single_segment = true; | ||
wparams.print_realtime = false; | ||
wparams.print_progress = false; | ||
wparams.print_timestamps = true; | ||
wparams.print_special = false; | ||
|
||
wparams.max_tokens = 32; | ||
wparams.audio_ctx = 768; // partial encoder context for better performance | ||
|
||
wparams.language = "en"; | ||
|
||
printf("stream: using %d threads\n", N_THREAD); | ||
|
||
std::vector<float> pcmf32; | ||
|
||
// whisper context | ||
auto & ctx = g_contexts[index]; | ||
|
||
// 5 seconds interval | ||
const int64_t window_samples = 5*WHISPER_SAMPLE_RATE; | ||
|
||
while (g_running) { | ||
stream_set_status("waiting for audio ..."); | ||
|
||
{ | ||
std::unique_lock<std::mutex> lock(g_mutex); | ||
|
||
if (g_pcmf32.size() < 1024) { | ||
lock.unlock(); | ||
|
||
std::this_thread::sleep_for(std::chrono::milliseconds(10)); | ||
|
||
continue; | ||
} | ||
|
||
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end()); | ||
g_pcmf32.clear(); | ||
} | ||
|
||
{ | ||
const auto t_start = std::chrono::high_resolution_clock::now(); | ||
|
||
stream_set_status("running whisper ..."); | ||
|
||
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()); | ||
if (ret != 0) { | ||
printf("whisper_full() failed: %d\n", ret); | ||
break; | ||
} | ||
|
||
const auto t_end = std::chrono::high_resolution_clock::now(); | ||
|
||
printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count()); | ||
} | ||
|
||
{ | ||
std::string text_heard; | ||
|
||
{ | ||
const int n_segments = whisper_full_n_segments(ctx); | ||
for (int i = n_segments - 1; i < n_segments; ++i) { | ||
const char * text = whisper_full_get_segment_text(ctx, i); | ||
|
||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); | ||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); | ||
|
||
printf("transcribed: %s\n", text); | ||
|
||
text_heard += text; | ||
} | ||
} | ||
|
||
{ | ||
std::lock_guard<std::mutex> lock(g_mutex); | ||
g_transcribed = text_heard; | ||
} | ||
} | ||
} | ||
|
||
if (index < g_contexts.size()) { | ||
whisper_free(g_contexts[index]); | ||
g_contexts[index] = nullptr; | ||
} | ||
} | ||
|
||
EMSCRIPTEN_BINDINGS(stream) { | ||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { | ||
for (size_t i = 0; i < g_contexts.size(); ++i) { | ||
if (g_contexts[i] == nullptr) { | ||
g_contexts[i] = whisper_init(path_model.c_str()); | ||
if (g_contexts[i] != nullptr) { | ||
g_running = true; | ||
if (g_worker.joinable()) { | ||
g_worker.join(); | ||
} | ||
g_worker = std::thread([i]() { | ||
stream_main(i); | ||
}); | ||
|
||
return i + 1; | ||
} else { | ||
return (size_t) 0; | ||
} | ||
} | ||
} | ||
|
||
return (size_t) 0; | ||
})); | ||
|
||
emscripten::function("free", emscripten::optional_override([](size_t index) { | ||
if (g_running) { | ||
g_running = false; | ||
} | ||
})); | ||
|
||
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) { | ||
--index; | ||
|
||
if (index >= g_contexts.size()) { | ||
return -1; | ||
} | ||
|
||
if (g_contexts[index] == nullptr) { | ||
return -2; | ||
} | ||
|
||
{ | ||
std::lock_guard<std::mutex> lock(g_mutex); | ||
const int n = audio["length"].as<int>(); | ||
|
||
emscripten::val heap = emscripten::val::module_property("HEAPU8"); | ||
emscripten::val memory = heap["buffer"]; | ||
|
||
g_pcmf32.resize(n); | ||
|
||
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n); | ||
memoryView.call<void>("set", audio); | ||
} | ||
|
||
return 0; | ||
})); | ||
|
||
emscripten::function("get_transcribed", emscripten::optional_override([]() { | ||
std::string transcribed; | ||
|
||
{ | ||
std::lock_guard<std::mutex> lock(g_mutex); | ||
transcribed = std::move(g_transcribed); | ||
} | ||
|
||
return transcribed; | ||
})); | ||
|
||
emscripten::function("get_status", emscripten::optional_override([]() { | ||
std::string status; | ||
|
||
{ | ||
std::lock_guard<std::mutex> lock(g_mutex); | ||
status = g_status_forced.empty() ? g_status : g_status_forced; | ||
} | ||
|
||
return status; | ||
})); | ||
|
||
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { | ||
{ | ||
std::lock_guard<std::mutex> lock(g_mutex); | ||
g_status_forced = status; | ||
} | ||
})); | ||
} |
Oops, something went wrong.