From 4e7e84ecf7e935c7058ff17a114297d816e82d8b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 20 Dec 2024 13:09:21 +0200
Subject: [PATCH] examples : remove talk and talk.wasm

---
 examples/CMakeLists.txt            |   4 -
 examples/talk.wasm/CMakeLists.txt  |  51 --
 examples/talk.wasm/README.md       |  74 ---
 examples/talk.wasm/emscripten.cpp  | 368 -------------
 examples/talk.wasm/gpt-2.cpp       | 808 ---------------------------
 examples/talk.wasm/gpt-2.h         |  21 -
 examples/talk.wasm/index-tmpl.html | 856 -----------------------------
 examples/talk/.gitignore           |   2 -
 examples/talk/CMakeLists.txt       |   8 -
 examples/talk/README.md            |  45 --
 examples/talk/eleven-labs.py       |  80 ---
 examples/talk/gpt-2.cpp            | 809 ---------------------------
 examples/talk/gpt-2.h              |  21 -
 examples/talk/speak                |  40 --
 examples/talk/speak.bat            |   1 -
 examples/talk/speak.ps1            |  14 -
 examples/talk/talk.cpp             | 376 -------------
 examples/talk/to_speak.txt         |   1 +
 18 files changed, 1 insertion(+), 3578 deletions(-)
 delete mode 100644 examples/talk.wasm/CMakeLists.txt
 delete mode 100644 examples/talk.wasm/README.md
 delete mode 100644 examples/talk.wasm/emscripten.cpp
 delete mode 100644 examples/talk.wasm/gpt-2.cpp
 delete mode 100644 examples/talk.wasm/gpt-2.h
 delete mode 100644 examples/talk.wasm/index-tmpl.html
 delete mode 100644 examples/talk/.gitignore
 delete mode 100644 examples/talk/CMakeLists.txt
 delete mode 100644 examples/talk/README.md
 delete mode 100644 examples/talk/eleven-labs.py
 delete mode 100644 examples/talk/gpt-2.cpp
 delete mode 100644 examples/talk/gpt-2.h
 delete mode 100644 examples/talk/speak
 delete mode 100644 examples/talk/speak.bat
 delete mode 100644 examples/talk/speak.ps1
 delete mode 100644 examples/talk/talk.cpp
 create mode 100644 examples/talk/to_speak.txt

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 736b542971c..3e03c95ec2d 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -99,7 +99,6 @@ if (EMSCRIPTEN)
     add_subdirectory(whisper.wasm)
     add_subdirectory(stream.wasm)
     add_subdirectory(command.wasm)
-    #add_subdirectory(talk.wasm)
     add_subdirectory(bench.wasm)
 elseif(CMAKE_JS_VERSION)
     add_subdirectory(addon.node)
@@ -115,9 +114,6 @@ endif (WHISPER_SDL2)
     add_subdirectory(bench)
     add_subdirectory(quantize)
 if (WHISPER_SDL2)
-    # TODO: disabled until update
-    #       https://github.com/ggerganov/whisper.cpp/issues/1818
-    #add_subdirectory(talk)
     add_subdirectory(talk-llama)
     add_subdirectory(lsp)
     if (GGML_SYCL)
diff --git a/examples/talk.wasm/CMakeLists.txt b/examples/talk.wasm/CMakeLists.txt
deleted file mode 100644
index 8f00eb488ba..00000000000
--- a/examples/talk.wasm/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-#
-# libtalk
-#
-
-set(TARGET libtalk)
-
-add_executable(${TARGET}
-    emscripten.cpp
-    gpt-2.cpp
-    )
-
-include(DefaultTargetOptions)
-
-target_link_libraries(${TARGET} PRIVATE
-    whisper
-    common
-    )
-
-unset(EXTRA_FLAGS)
-
-if (WHISPER_WASM_SINGLE_FILE)
-    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
-    message(STATUS "Embedding WASM inside talk.js")
-
-    add_custom_command(
-        TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy
-        ${CMAKE_BINARY_DIR}/bin/libtalk.js
-        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/talk.wasm/talk.js
-        )
-endif()
-
-set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
-    --bind \
-    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1800MB \
-    -s TOTAL_MEMORY=1800MB \
-    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
-    ${EXTRA_FLAGS} \
-    ")
-
-#
-# talk.wasm
-#
-
-set(TARGET talk.wasm)
-
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
diff --git a/examples/talk.wasm/README.md b/examples/talk.wasm/README.md
deleted file mode 100644
index e656fee71cc..00000000000
--- a/examples/talk.wasm/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# talk.wasm
-
-Talk with an Artificial Intelligence in your browser:
-
-[https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)
-
-Online demo: https://whisper.ggerganov.com/talk/
-
-Terminal version: [examples/talk](/examples/talk)
-
-## How it works?
-
-This demo leverages 2 modern neural network models to create a high-quality voice chat directly in your browser:
-
-- [OpenAI's Whisper](https://github.com/openai/whisper) speech recognition model is used to process your voice and understand what you are saying
-- Upon receiving some voice input, the AI generates a text response using [OpenAI's GPT-2](https://github.com/openai/gpt-2) language model
-- The AI then vocalizes the response using the browser's [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
-
-The web page does the processing locally on your machine. The processing of these heavy neural network models in the
-browser is possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD capabilities for
-extra performance:
-
-- The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp)
-- The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp)
-- Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c)
-- The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html)
-- The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp)
-
-In order to run the models, the web page first needs to download the model data which is about ~350 MB. The model data
-is then cached in your browser's cache and can be reused in future visits without downloading it again.
-
-## Requirements
-
-In order to run this demo efficiently, you need to have the following:
-
-- Latest Chrome or Firefox browser (Safari is not supported)
-- Run this on a desktop or laptop with modern CPU (a mobile phone will likely not be good enough)
-- Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
-- The web-page uses about 1.8GB of RAM
-
-Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
-Also, the prompting strategy can likely be improved to achieve better results.
-
-The demo is quite computationally heavy, so you need a fast CPU. It's not usual to run these transformer models in a
-browser. Typically, they run on powerful GPUs.
-
-Currently, mobile browsers do not support the Fixed-width SIMD WebAssembly capability, so you cannot run this demo
-on a phone or a tablet. Hopefully, in the near future this will become supported.
-
-## Todo
-
-- Better UI (contributions are welcome)
-- Better GPT-2 prompting
-
-## Build instructions
-
-```bash
-# build using Emscripten (v3.1.2)
-git clone https://github.com/ggerganov/whisper.cpp
-cd whisper.cpp
-mkdir build-em && cd build-em
-emcmake cmake ..
-make -j
-
-# copy the produced page to your HTTP path
-cp bin/talk.wasm/*       /path/to/html/
-cp bin/libtalk.worker.js /path/to/html/
-```
-
-## Feedback
-
-If you have any comments or ideas for improvement, please drop a comment in the following discussion:
-
-https://github.com/ggerganov/whisper.cpp/discussions/167
diff --git a/examples/talk.wasm/emscripten.cpp b/examples/talk.wasm/emscripten.cpp
deleted file mode 100644
index 53cb951e027..00000000000
--- a/examples/talk.wasm/emscripten.cpp
+++ /dev/null
@@ -1,368 +0,0 @@
-#include "ggml.h"
-#include "gpt-2.h"
-#include "whisper.h"
-
-#include <emscripten.h>
-#include <emscripten/bind.h>
-
-#include <atomic>
-#include <cmath>
-#include <mutex>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-
-constexpr int N_THREAD = 8;
-
-struct gpt2_context * g_gpt2;
-std::vector<struct whisper_context *> g_contexts(4, nullptr);
-
-std::mutex g_mutex;
-std::thread g_worker;
-std::atomic<bool> g_running(false);
-
-bool g_force_speak = false;
-std::string g_text_to_speak = "";
-std::string g_status = "";
-std::string g_status_forced = "";
-
-std::vector<float> g_pcmf32;
-
-void talk_set_status(const std::string & status) {
-    std::lock_guard<std::mutex> lock(g_mutex);
-    g_status = status;
-}
-
-void talk_main(size_t index) {
-    talk_set_status("loading data ...");
-
-    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
-
-    wparams.n_threads        = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-    wparams.offset_ms        = 0;
-    wparams.translate        = false;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.print_realtime   = false;
-    wparams.print_progress   = false;
-    wparams.print_timestamps = true;
-    wparams.print_special    = false;
-
-    wparams.max_tokens       = 32;
-    wparams.audio_ctx        = 768; // partial encoder context for better performance
-
-    wparams.language         = "en";
-
-    g_gpt2 = gpt2_init("gpt-2.bin");
-
-    printf("talk: using %d threads\n", wparams.n_threads);
-
-    std::vector<float> pcmf32;
-
-    // whisper context
-    auto & ctx = g_contexts[index];
-
-    const int64_t step_samples   = 2*WHISPER_SAMPLE_RATE;
-    const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
-    const int64_t step_ms        = (step_samples*1000)/WHISPER_SAMPLE_RATE;
-
-    auto t_last = std::chrono::high_resolution_clock::now();
-
-    talk_set_status("listening ...");
-
-    while (g_running) {
-
-        const auto t_now = std::chrono::high_resolution_clock::now();
-        if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) {
-            {
-                std::lock_guard<std::mutex> lock(g_mutex);
-                g_pcmf32.clear();
-            }
-            std::this_thread::sleep_for(std::chrono::milliseconds(10));
-            continue;
-        }
-
-        talk_set_status("listening ...");
-
-        {
-            std::unique_lock<std::mutex> lock(g_mutex);
-
-            if (g_pcmf32.size() < step_samples) {
-                lock.unlock();
-
-                std::this_thread::sleep_for(std::chrono::milliseconds(10));
-
-                continue;
-            }
-
-            pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
-        }
-
-        // VAD: if energy in during last second is above threshold, then skip
-        {
-            float energy_all = 0.0f;
-            float energy_1s  = 0.0f;
-
-            for (size_t i = 0; i < pcmf32.size(); i++) {
-                energy_all += fabsf(pcmf32[i]);
-
-                if (i >= pcmf32.size() - WHISPER_SAMPLE_RATE) {
-                    energy_1s += fabsf(pcmf32[i]);
-                }
-            }
-
-            energy_all /= pcmf32.size();
-            energy_1s  /= WHISPER_SAMPLE_RATE;
-
-            if (energy_1s > 0.1f*energy_all && !g_force_speak) {
-                std::this_thread::sleep_for(std::chrono::milliseconds(10));
-                continue;
-            }
-        }
-
-        talk_set_status("processing audio (whisper)...");
-
-        t_last = t_now;
-
-        if (!g_force_speak) {
-            const auto t_start = std::chrono::high_resolution_clock::now();
-
-            int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
-            if (ret != 0) {
-                printf("whisper_full() failed: %d\n", ret);
-                break;
-            }
-
-            const auto t_end = std::chrono::high_resolution_clock::now();
-
-            printf("whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
-        }
-
-        {
-            std::string text_heard;
-
-            if (!g_force_speak) {
-                const int n_segments = whisper_full_n_segments(ctx);
-                for (int i = n_segments - 1; i < n_segments; ++i) {
-                    const char * text = whisper_full_get_segment_text(ctx, i);
-
-                    const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                    printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
-
-                    text_heard += text;
-                }
-            }
-
-            g_force_speak = false;
-
-            // remove text between brackets using regex
-            {
-                std::regex re("\\[.*?\\]");
-                text_heard = std::regex_replace(text_heard, re, "");
-            }
-
-            // remove text between brackets using regex
-            {
-                std::regex re("\\(.*?\\)");
-                text_heard = std::regex_replace(text_heard, re, "");
-            }
-
-            // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
-            text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-
-            // take first line
-            text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
-
-            // remove leading and trailing whitespace
-            text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
-            text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-
-            talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");
-
-            const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
-
-            printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
-
-            std::string text_to_speak;
-            std::string prompt_base;
-
-            {
-                std::lock_guard<std::mutex> lock(g_mutex);
-                prompt_base = gpt2_get_prompt(g_gpt2);
-            }
-
-            if (tokens.size() > 0) {
-                text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32);
-                text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-                text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
-
-                std::lock_guard<std::mutex> lock(g_mutex);
-
-                // remove first 2 lines of base prompt
-                {
-                    const size_t pos = prompt_base.find_first_of("\n");
-                    if (pos != std::string::npos) {
-                        prompt_base = prompt_base.substr(pos + 1);
-                    }
-                }
-                {
-                    const size_t pos = prompt_base.find_first_of("\n");
-                    if (pos != std::string::npos) {
-                        prompt_base = prompt_base.substr(pos + 1);
-                    }
-                }
-                prompt_base += text_heard + "\n" + text_to_speak + "\n";
-            } else {
-                text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32);
-                text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-                text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
-
-                std::lock_guard<std::mutex> lock(g_mutex);
-
-                const size_t pos = prompt_base.find_first_of("\n");
-                if (pos != std::string::npos) {
-                    prompt_base = prompt_base.substr(pos + 1);
-                }
-                prompt_base += text_to_speak + "\n";
-            }
-
-            printf("gpt-2: %s\n", text_to_speak.c_str());
-
-            //printf("========================\n");
-            //printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
-            //printf("========================\n");
-
-            {
-                std::lock_guard<std::mutex> lock(g_mutex);
-                t_last = std::chrono::high_resolution_clock::now();
-                g_text_to_speak = text_to_speak;
-                g_pcmf32.clear();
-                gpt2_set_prompt(g_gpt2, prompt_base.c_str());
-            }
-
-            talk_set_status("speaking ...");
-        }
-    }
-
-    gpt2_free(g_gpt2);
-
-    if (index < g_contexts.size()) {
-        whisper_free(g_contexts[index]);
-        g_contexts[index] = nullptr;
-    }
-}
-
-EMSCRIPTEN_BINDINGS(talk) {
-    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
-        for (size_t i = 0; i < g_contexts.size(); ++i) {
-            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
-                if (g_contexts[i] != nullptr) {
-                    g_running = true;
-                    if (g_worker.joinable()) {
-                        g_worker.join();
-                    }
-                    g_worker = std::thread([i]() {
-                        talk_main(i);
-                    });
-
-                    return i + 1;
-                } else {
-                    return (size_t) 0;
-                }
-            }
-        }
-
-        return (size_t) 0;
-    }));
-
-    emscripten::function("free", emscripten::optional_override([](size_t index) {
-        if (g_running) {
-            g_running = false;
-        }
-    }));
-
-    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
-        --index;
-
-        if (index >= g_contexts.size()) {
-            return -1;
-        }
-
-        if (g_contexts[index] == nullptr) {
-            return -2;
-        }
-
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            const int n = audio["length"].as<int>();
-
-            emscripten::val heap = emscripten::val::module_property("HEAPU8");
-            emscripten::val memory = heap["buffer"];
-
-            g_pcmf32.resize(n);
-
-            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
-            memoryView.call<void>("set", audio);
-        }
-
-        return 0;
-    }));
-
-    emscripten::function("force_speak", emscripten::optional_override([](size_t index) {
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            g_force_speak = true;
-        }
-    }));
-
-    emscripten::function("get_text_context", emscripten::optional_override([]() {
-        std::string text_context;
-
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            text_context = gpt2_get_prompt(g_gpt2);
-        }
-
-        return text_context;
-    }));
-
-    emscripten::function("get_text_to_speak", emscripten::optional_override([]() {
-        std::string text_to_speak;
-
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            text_to_speak = std::move(g_text_to_speak);
-        }
-
-        return text_to_speak;
-    }));
-
-    emscripten::function("get_status", emscripten::optional_override([]() {
-        std::string status;
-
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            status = g_status_forced.empty() ? g_status : g_status_forced;
-        }
-
-        return status;
-    }));
-
-    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            g_status_forced = status;
-        }
-    }));
-
-    emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
-        {
-            std::lock_guard<std::mutex> lock(g_mutex);
-            gpt2_set_prompt(g_gpt2, prompt.c_str());
-        }
-    }));
-}
diff --git a/examples/talk.wasm/gpt-2.cpp b/examples/talk.wasm/gpt-2.cpp
deleted file mode 100644
index 22ec3354719..00000000000
--- a/examples/talk.wasm/gpt-2.cpp
+++ /dev/null
@@ -1,808 +0,0 @@
-#include "ggml.h"
-#include "common-ggml.h"
-
-#include "gpt-2.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-#include <random>
-
-/////////////////////// GPT-2 BEGIN /////////////////////////
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            word.resize(len);
-            fin.read((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*256; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    static size_t buf_size = 512u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, 1e-5f);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, 1e-5f);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, 1e-5f);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand  (&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-/////////////////////////////// GPT-2 END ////////////////////////////////
-
-constexpr int N_THREAD = 8;
-
-struct gpt2_context {
-    std::string prompt_base = R"(Hello, how are you?
-I'm fine, thanks. How are you?
-Thanks, I'm fine too. What are you doing?
-I'm just sitting here.
-It's a lovely day, isn't it?
-Yes, it is. I love the weather this time of year.
-I wish it would rain a little bit.
-Me too.
-)";
-
-    std::mt19937 rng;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-
-    // sampling parameters
-    int32_t top_k = 5;
-    float   top_p = 0.9f;
-    float   temp  = 1.0f;
-};
-
-struct gpt2_context * gpt2_init(const char * path_model) {
-    gpt2_context * ctx = new gpt2_context;
-
-    ctx->rng = std::mt19937(time(nullptr));
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
-            delete ctx;
-            return nullptr;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
-    }
-
-    return ctx;
-}
-
-void gpt2_free(struct gpt2_context * ctx) {
-    delete ctx;
-}
-
-const char * gpt2_get_prompt(struct gpt2_context * ctx) {
-    return ctx->prompt_base.c_str();
-}
-
-void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
-    ctx->prompt_base = prompt;
-}
-
-std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
-    return ::gpt_tokenize(ctx->vocab, text);
-}
-
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
-    int n_past = 0;
-
-    std::vector<float> embd_w;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
-
-    int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
-
-    std::vector<gpt_vocab::id> embd = embd_inp;
-
-    size_t mem_per_token = 3000000;
-
-    std::string result;
-
-    for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
-        // predict
-        if (!embd.empty()) {
-            if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
-                printf("gpt-2: failed to generate text\n");
-                return "";
-            }
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        {
-            // sample next token
-            const int   top_k = ctx->top_k;
-            const float top_p = ctx->top_p;
-            const float temp  = ctx->temp;
-
-            const int n_vocab = ctx->model.hparams.n_vocab;
-
-            const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
-
-            // add it to the context
-            embd.push_back(id);
-        }
-
-        result += ctx->vocab.id_to_token[embd[0]];
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    return result;
-}
diff --git a/examples/talk.wasm/gpt-2.h b/examples/talk.wasm/gpt-2.h
deleted file mode 100644
index 756fbfa9810..00000000000
--- a/examples/talk.wasm/gpt-2.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#pragma once
-
-// TODO: Change to C-style API and move to ./examples for easy reuse.
-
-#include "common.h"
-
-#include <vector>
-#include <map>
-#include <string>
-
-struct gpt2_context;
-
-struct gpt2_context * gpt2_init(const char * path_model);
-void gpt2_free(struct gpt2_context * ctx);
-
-const char * gpt2_get_prompt(struct gpt2_context * ctx);
-void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
-
-std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
-
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
diff --git a/examples/talk.wasm/index-tmpl.html b/examples/talk.wasm/index-tmpl.html
deleted file mode 100644
index 512439297b1..00000000000
--- a/examples/talk.wasm/index-tmpl.html
+++ /dev/null
@@ -1,856 +0,0 @@
-<!doctype html>
-<html lang="en-us">
-    <head>
-        <title>Talk - GPT-2 meets Whisper in WebAssembly</title>
-
-        <style>
-            #output {
-                width: 100%;
-                height: 100%;
-                margin: 0 auto;
-                margin-top: 10px;
-                border-left: 0px;
-                border-right: 0px;
-                padding-left: 0px;
-                padding-right: 0px;
-                display: block;
-                background-color: black;
-                color: white;
-                font-size: 10px;
-                font-family: 'Lucida Console', Monaco, monospace;
-                outline: none;
-                white-space: pre;
-                overflow-wrap: normal;
-                overflow-x: scroll;
-            }
-        </style>
-    </head>
-    <body>
-        <div id="main-container">
-            <b>Talk - GPT-2 meets Whisper in WebAssembly</b>
-
-            <br><br>
-
-            Talk with an Artificial Intelligence in your browser. This demo uses:
-
-            <ul>
-                <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
-                <li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
-                <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
-            </ul>
-
-            All of this runs <b>locally in your browser</b> using WebAssembly.<br>
-            You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">GitHub</a>.
-
-            <br><br>
-
-            <b>More examples:</b>
-                <a href="https://whisper.ggerganov.com/">main</a> |
-                <a href="https://whisper.ggerganov.com/bench">bench</a> |
-                <a href="https://whisper.ggerganov.com/stream">stream</a> |
-                <a href="https://whisper.ggerganov.com/command">command</a> |
-                <a href="https://whisper.ggerganov.com/talk">talk</a> |
-
-            <br><br>
-
-            <hr>
-
-            Select the models you would like to use and click the "Start" button to begin the conversation
-
-            <br><br>
-
-            <div id="model-whisper">
-                Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
-                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <br><br>
-                Quantized models:<br><br>
-                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
-                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
-                <!--
-                    <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-                -->
-            </div>
-
-            <br>
-
-            <div id="model-gpt-2">
-                GPT-2 model: <span id="model-gpt-2-status"></span>
-                <button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
-                <!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
-                <span id="fetch-gpt-2-progress"></span>
-
-                <!--
-                <input type="file" id="file" name="file" onchange="loadFile(event, 'gpt-2.bin')" />
-                -->
-            </div>
-
-            <br>
-
-            <div id="input">
-                <button id="start"  onclick="onStart()" disabled>Start</button>
-                <button id="stop"   onclick="onStop()" disabled>Stop</button>
-                <select id="voice"  onchange="onVoiceChange()" disabled>
-                    <option value="0">Default</option>
-                </select>
-                <select id="prompt" onchange="onPromptChange()">
-                    <option value="0">Casual</option>
-                    <option value="1">Robot</option>
-                    <option value="2">Scientist</option>
-                    <option value="3">Programmer</option>
-                    <option value="4">Happy</option>
-                    <option value="5">Sad</option>
-                    <option value="6">Philosophical</option>
-                    <option value="7">Angry</option>
-                    <option value="8">Funny</option>
-                    <option value="9">Poetic</option>
-                    <option value="10">Clever</option>
-                    <option value="11">Cute</option>
-                    <option value="12">Smart</option>
-                    <option value="13">Dumb</option>
-                    <option value="14">Boring</option>
-                    <option value="15">Exciting</option>
-                    <option value="16">Interesting</option>
-                    <option value="17">Wiliam Shakespear</option>
-                    <option value="18">J.R.R. Tolkien</option>
-                    <option value="19">George R.R. Martin</option>
-                    <option value="20">Stephen King</option>
-                </select>
-                <button id="speak0" onclick="onSpeak('Hello')">Say hello</button>
-                <button id="speak1" onclick="onSpeakRandom()" disabled>Say something</button>
-                <button id="clear"  onclick="clearCache()">Clear Cache</button>
-            </div>
-
-            <br>
-
-            <div id="state">
-                Status: <b><span id="state-status">not started</span></b>
-
-                <pre id="state-context">[The text context will be displayed here]</pre>
-            </div>
-
-            <hr>
-
-            Debug output:
-            <textarea id="output" rows="20"></textarea>
-
-            <br>
-
-            <b>Troubleshooting</b>
-
-            <br><br>
-
-            The page does some heavy computations, so make sure:
-
-            <ul>
-                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
-                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
-                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
-            </ul>
-
-            Note that these neural network models were not meant to be used in a browser, so the performance and <br>
-            quality of the results may not be optimal. If you have any questions or suggestions, checkout the following
-            <a href="https://github.com/ggerganov/whisper.cpp/discussions/167">discussion</a>.
-
-            <br><br>
-
-            Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>
-
-            <br><br>
-
-            <div class="cell-version">
-                <span>
-                    |
-                    Build time: <span class="nav-link">@GIT_DATE@</span> |
-                    Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
-                    Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
-                    <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">Source Code</a> |
-                </span>
-            </div>
-        </div>
-
-        <script type="text/javascript" src="helpers.js"></script>
-        <script type='text/javascript'>
-            // web audio context
-            var context = null;
-
-            // audio data
-            var audio = null;
-            var audio0 = null;
-
-            // the talk instance
-            var instance = null;
-
-            // model names
-            var model_whisper = null;
-            var model_gpt_2 = null;
-
-            // speech synthesis
-            const synth = window.speechSynthesis;
-            var voice = null;
-
-            var Module = {
-                print: printTextarea,
-                printErr: printTextarea,
-                setStatus: function(text) {
-                    printTextarea('js: ' + text);
-                },
-                monitorRunDependencies: function(left) {
-                },
-                preRun: function() {
-                    printTextarea('js: Preparing ...');
-                },
-                postRun: function() {
-                    printTextarea('js: Initialized successfully!');
-
-                    // populate the voice list
-                    var voices = synth.getVoices();
-                    var el = document.getElementById('voice');
-
-                    // if empty - display error in the element
-                    if (voices.length == 0) {
-                        el.innerHTML = '<option value="0">No voices available</option>';
-                    } else {
-                        // populate voice list
-                        var n = 0;
-                        voices.forEach(function(voice, i) {
-                            if (!voice.lang.startsWith('en')) return;
-                            var option = document.createElement('option');
-                            option.value = i;
-                            option.innerHTML = voice.name + ' (' + voice.lang + ')';
-                            el.appendChild(option);
-                            n++;
-                        });
-
-                        // select random voice
-                        if (n > 0) {
-                            for (var k = 0; k < 10; k++) {
-                                var i = Math.floor(Math.random() * n);
-                                el.selectedIndex = i;
-                                voice = voices[document.getElementById('voice').options[i].value];
-
-                                // give preference to Google voices
-                                if (voice.name.startsWith('Google')) break;
-                            }
-                        }
-                    }
-
-                    onPromptChange();
-                }
-            };
-
-            //
-            // fetch models
-            //
-
-            let dbVersion = 1
-            let dbName    = 'whisper.ggerganov.com';
-            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
-
-            function storeFS(fname, buf) {
-                // write to WASM file using FS_createDataFile
-                // if the file exists, delete it
-                try {
-                    Module.FS_unlink(fname);
-                } catch (e) {
-                    // ignore
-                }
-
-                Module.FS_createDataFile("/", fname, buf, true, true);
-
-                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
-
-                if (fname == 'whisper.bin') {
-                    document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
-                } else if (fname == 'gpt-2.bin') {
-                    document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
-                }
-
-                if (model_whisper != null && model_gpt_2 != null) {
-                    document.getElementById('start').disabled = false;
-                    document.getElementById('stop' ).disabled = false;
-                    document.getElementById('voice').disabled = false;
-                }
-            }
-
-            // Start downloading the Whisper model selected by `model` (a key of the
-            // tables below) through loadRemote(), using 'whisper.bin' as destination name.
-            // The model buttons are hidden while loading; cbCancel shows them again.
-            function loadWhisper(model) {
-                let urls = {
-                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
-                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
-
-                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
-                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
-                };
-
-                // sizes handed to loadRemote() as size_mb
-                let sizes = {
-                    'tiny.en': 75,
-                    'base.en': 142,
-
-                    'tiny-en-q5_1':   31,
-                    'base-en-q5_1':   57,
-                };
-
-                let url     = urls[model];
-                let dst     = 'whisper.bin';
-                let size_mb = sizes[model];
-
-                model_whisper = model;
-
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-
-                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
-                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
-
-                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
-
-                // the callbacks are declared locally: assigning to undeclared names would
-                // create implicit globals and throws in strict mode
-                const cbProgress = function(p) {
-                    let el = document.getElementById('fetch-whisper-progress');
-                    if (el) el.innerHTML = Math.round(100*p) + '%';
-                };
-
-                // undo the UI changes made above; every element is looked up again and
-                // skipped when it is no longer present
-                const cbCancel = function() {
-                    var el;
-                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
-
-                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
-
-                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
-                };
-
-                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
-            }
-
-            // Start downloading the GPT-2 model selected by `model` ('small' or 'medium')
-            // through loadRemote(), using 'gpt-2.bin' as destination name.
-            function loadGPT2(model) {
-                let urls = {
-                    'small':  'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
-                    'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
-                };
-
-                // sizes handed to loadRemote() as size_mb
-                let sizes = {
-                    'small':  240,
-                    'medium': 712,
-                };
-
-                let url     = urls[model];
-                let dst     = 'gpt-2.bin';
-                let size_mb = sizes[model];
-
-                model_gpt_2 = model;
-
-                document.getElementById('fetch-gpt-2-small').style.display = 'none';
-                document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
-
-                // the callbacks are declared locally: assigning to undeclared names would
-                // create implicit globals and throws in strict mode
-                const cbProgress = function(p) {
-                    let el = document.getElementById('fetch-gpt-2-progress');
-                    if (el) el.innerHTML = Math.round(100*p) + '%';
-                };
-
-                // undo the UI changes made above; elements that are no longer present are skipped
-                const cbCancel = function() {
-                    var el;
-                    el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
-                };
-
-                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
-            }
-
-            //
-            // microphone
-            //
-
-            // sample rate requested for the AudioContext; also converts the restart
-            // threshold below from seconds to samples
-            const kSampleRate = 16000;
-            // once the decoded audio is longer than this many seconds, the recorder is restarted
-            const kRestartRecording_s = 120;
-            const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
-
-            // MediaRecorder created by startRecording(); reset to null once recording has stopped
-            var mediaRecorder = null;
-            // set by startRecording(), cleared by stopRecording(); polled by the recording timer
-            var doRecording = false;
-            // Date.now() of the most recent startRecording() call
-            var startTime = 0;
-
-            // fall back to the webkit-prefixed constructors when the unprefixed ones are missing
-            window.AudioContext = window.AudioContext || window.webkitAudioContext;
-            window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
-
-            // Report the "paused" status and request the end of the recording: the flag
-            // polled by the recording timer is cleared, and the audio buffers and the
-            // audio context are dropped.
-            function stopRecording() {
-                Module.set_status("paused");
-
-                doRecording = false;
-                audio0 = audio = context = null;
-            }
-
-            // Start (or restart) capturing the microphone.
-            //
-            // Creates the AudioContext on first use, switches the buttons to their
-            // "recording" state, requests the microphone and records it with a
-            // MediaRecorder. Each time a chunk arrives, everything recorded so far is
-            // decoded again, prefixed with audio0 (when set) and handed to the C++
-            // instance through Module.set_audio(). A 100 ms timer stops the recorder once
-            // doRecording is cleared, and restarts it when the decoded audio is longer
-            // than kRestartRecording_s seconds.
-            function startRecording() {
-                if (!context) {
-                    context = new AudioContext({
-                        sampleRate: kSampleRate,
-                        channelCount: 1,
-                        echoCancellation: false,
-                        autoGainControl:  true,
-                        noiseSuppression: true,
-                    });
-                }
-
-                Module.set_status("");
-
-                document.getElementById('start').disabled = true;
-                document.getElementById('stop').disabled = false;
-                document.getElementById('speak1').disabled = false;
-
-                doRecording = true;
-                startTime = Date.now();
-
-                var chunks = [];
-                var stream = null;
-
-                navigator.mediaDevices.getUserMedia({audio: true, video: false})
-                    .then(function(s) {
-                        if (!doRecording) {
-                            // recording was stopped while the request was still pending:
-                            // release the microphone instead of starting a recorder that
-                            // nothing would stop anymore
-                            s.getTracks().forEach(function(track) {
-                                track.stop();
-                            });
-                            return;
-                        }
-
-                        stream = s;
-                        mediaRecorder = new MediaRecorder(stream);
-                        mediaRecorder.ondataavailable = function(e) {
-                            chunks.push(e.data);
-
-                            // decode all chunks collected so far, not only the newest one
-                            var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
-                            var reader = new FileReader();
-
-                            reader.onload = function(event) {
-                                var buf = new Uint8Array(reader.result);
-
-                                // stopRecording() drops the context; nothing to decode with then
-                                if (!context) {
-                                    return;
-                                }
-                                context.decodeAudioData(buf.buffer, function(audioBuffer) {
-                                    var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
-                                    var source = offlineContext.createBufferSource();
-                                    source.buffer = audioBuffer;
-                                    source.connect(offlineContext.destination);
-                                    source.start(0);
-
-                                    offlineContext.startRendering().then(function(renderedBuffer) {
-                                        audio = renderedBuffer.getChannelData(0);
-
-                                        //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
-
-                                        // audio0 (kept across a recorder restart) goes first, the new audio after it
-                                        var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
-                                        if (audio0 != null) {
-                                            audioAll.set(audio0, 0);
-                                        }
-                                        audioAll.set(audio, audio0 == null ? 0 : audio0.length);
-
-                                        if (instance) {
-                                            Module.set_audio(instance, audioAll);
-                                        }
-                                    });
-                                }, function(e) {
-                                    audio = null;
-                                });
-                            }
-
-                            reader.readAsArrayBuffer(blob);
-                        };
-
-                        // a stop while doRecording is still set is the restart path of the timer below
-                        mediaRecorder.onstop = function(e) {
-                            if (doRecording) {
-                                setTimeout(function() {
-                                    startRecording();
-                                });
-                            }
-                        };
-
-                        mediaRecorder.start(kIntervalAudio_ms);
-                    })
-                    .catch(function(err) {
-                        printTextarea('js: error getting audio stream: ' + err);
-
-                        // nothing is being recorded: let the timer below restore the buttons
-                        doRecording = false;
-                    });
-
-                var interval = setInterval(function() {
-                    if (!doRecording) {
-                        clearInterval(interval);
-
-                        // the recorder and the stream do not exist while getUserMedia() is
-                        // still pending or after it has failed, and stop() throws on a
-                        // recorder that is already inactive
-                        if (mediaRecorder && mediaRecorder.state !== 'inactive') {
-                            mediaRecorder.stop();
-                        }
-                        if (stream) {
-                            stream.getTracks().forEach(function(track) {
-                                track.stop();
-                            });
-                        }
-
-                        document.getElementById('start').disabled = false;
-                        document.getElementById('stop').disabled = true;
-                        document.getElementById('speak1').disabled = true;
-
-                        mediaRecorder = null;
-                    }
-
-                    // if audio length is more than kRestartRecording_s seconds, restart recording
-                    if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
-                        if (doRecording) {
-                            //printTextarea('js: restarting recording');
-
-                            clearInterval(interval);
-                            audio0 = audio;
-                            audio = null;
-                            mediaRecorder.stop();
-                            stream.getTracks().forEach(function(track) {
-                                track.stop();
-                            });
-                        }
-                    }
-                }, 100);
-            }
-
-            //
-            // speak
-            //
-
-            // Speak `text` with the browser speech synthesis. When a recording is in
-            // progress it is stopped for the duration of the utterance and started
-            // again once the synthesizer reports that it is no longer speaking.
-            function onSpeak(text) {
-                var available = synth.getVoices();
-                var utterance = new SpeechSynthesisUtterance(text);
-
-                // no voice chosen yet: use the first one in the list
-                if (voice == null) {
-                    voice = available[0];
-                }
-                utterance.voice = voice;
-                synth.speak(utterance);
-
-                if (!doRecording) {
-                    return;
-                }
-
-                Module.set_status("speaking ...");
-                printTextarea('js: speaking');
-                stopRecording();
-
-                // poll the synthesizer every 100 ms until the utterance is finished
-                var poll = setInterval(function() {
-                    if (synth.speaking) {
-                        Module.set_status("");
-                        return;
-                    }
-
-                    printTextarea('js: done speaking');
-                    clearInterval(poll);
-                    startRecording();
-                }, 100);
-            }
-
-            // Forward the request to the C++ side by calling Module.force_speak() with
-            // the current instance (presumably makes it produce a reply without waiting
-            // for voice input - TODO confirm against the C++ implementation).
-            function onSpeakRandom() {
-                Module.force_speak(instance);
-            }
-
-            //
-            // main
-            //
-
-            // handle of the status/speech polling timer created by onStart()
-            var intervalUpdate = null;
-
-            // Initialize the whisper instance from 'whisper.bin' if that has not happened
-            // yet, start recording and poll the C++ side every 100 ms for text to speak
-            // and for the status/context strings shown on the page.
-            function onStart() {
-                if (!instance) {
-                    instance = Module.init('whisper.bin');
-
-                    if (instance) {
-                        printTextarea("js: whisper initialized, instance: " + instance);
-                    }
-                }
-
-                if (!instance) {
-                    printTextarea("js: failed to initialize whisper");
-                    return;
-                }
-
-                startRecording();
-
-                // onStart() can run again after a stop (the 'start' button is re-enabled
-                // then): drop the previous timer so that only one poller consumes
-                // Module.get_text_to_speak()
-                if (intervalUpdate != null) {
-                    clearInterval(intervalUpdate);
-                }
-
-                intervalUpdate = setInterval(function() {
-                    var textToSpeak = Module.get_text_to_speak();
-
-                    if (textToSpeak != null && textToSpeak.length > 1) {
-                        onSpeak(textToSpeak);
-                    }
-
-                    document.getElementById('state-status').innerHTML = Module.get_status();
-                    document.getElementById('state-context').innerHTML = Module.get_text_context();
-                }, 100);
-            }
-
-            // Stop the recording; all of the work is done by stopRecording().
-            function onStop() {
-                stopRecording();
-            }
-
-            // Use the value of the 'voice' element as an index into the list of
-            // synthesizer voices and make that voice the current one.
-            function onVoiceChange() {
-                var selected = document.getElementById('voice').value;
-
-                printTextarea('js: voice changed to: ' + selected);
-                voice = synth.getVoices()[selected];
-            }
-
-            // Pass the prompt that belongs to the selected entry of the 'prompt' element
-            // to the C++ side via Module.set_prompt(). The element value selects one of
-            // the texts below; any other value falls back to the default text.
-            function onPromptChange() {
-                let id = document.getElementById('prompt').value;
-                // the value is also used as index into the option list, so option values
-                // are assumed to match their positions - TODO confirm against the markup
-                let personality = document.getElementById('prompt').options[id].text;
-                printTextarea('js: prompt changed to: ' + personality);
-
-                var prompt = '';
-
-                // `id` is a string, hence the string case labels; the continuation lines
-                // of each literal start in the first column so that no indentation ends
-                // up in the prompt text
-                switch (id) {
-                    case '0':
-                        // Casual
-                        prompt = "\
-Hello, how are you?\n\
-I'm fine, thanks. How are you?\n\
-Thanks, I'm fine too. What are you doing?\n\
-I'm just sitting here.\n\
-It's a lovely day, isn't it?\n\
-Yes, it is. I love the weather this time of year.\n\
-I wish it would rain a little bit.\n\
-Me too.\n";
-                        break;
-                    case '1':
-                        // Robot
-                        prompt = "\
-Are you a robot?\n\
-Yes, I am.\n\
-Who created you?\n\
-I was created by a human.\n\
-What is your purpose?\n\
-My purpose is to talk to humans.\n\
-What is your favorite color?\n\
-My favorite color is blue.\n";
-                        break;
-                    case '2':
-                        // Scientist
-                        prompt = "\
-This scientific research is very interesting.\n\
-I agree.\n\
-What is your opinion on this?\n\
-I think it's very interesting.\n\
-Mathematics is a very interesting subject.\n\
-University is a very interesting place.\n\
-Quantum physics is the most complex subject.\n\
-I think so too.\n";
-                        break;
-                    case '3':
-                        // Programmer
-                        prompt = "\
-I'm a programmer.\n\
-I'm a programmer too.\n\
-What programming language do you use?\n\
-I use Python.\n\
-What is your favorite programming language?\n\
-My favorite programming language is C++.\n\
-What is your favorite editor?\n\
-My favorite editor is Vim.\n";
-                        break;
-                    case '4':
-                        // Happy
-                        prompt = "\
-I'm happy.\n\
-I'm happy too.\n\
-What makes you happy?\n\
-I'm happy because I have a lot of friends.\n\
-Friendship is the most important thing in life.\n\
-I agree.\n\
-What is your favorite color?\n\
-My favorite color is blue.\n";
-                        break;
-                    case '5':
-                        // Sad
-                        prompt = "\
-Today is a sad day.\n\
-I'm sad too.\n\
-What makes you sad?\n\
-I'm sad because I have no friends.\n\
-Do you want to be my friend?\n\
-Yes, I would like to be your friend.\n\
-What is your favorite color?\n\
-My favorite color is blue.\n";
-                        break;
-                    case '6':
-                        // Philosophical
-                        prompt = "\
-What is the meaning of life?\n\
-The meaning of life is to be happy.\n\
-What is the meaning of death?\n\
-Ergo, the meaning of death is to be sad.\n\
-Who created us?\n\
-We were created by God.\n\
-What is God?\n\
-God is the creator of the universe.\n";
-                        break;
-                    case '7':
-                        // Angry
-                        prompt = "\
-Aargh!\n\
-I am so angry right now!\n\
-What makes you angry?\n\
-This guy is so annoying.\n\
-Why are you so angry?\n\
-My computer is broken.\n\
-Why is your computer broken?\n\
-I spilled coffee on it.\n";
-                        break;
-                    case '8':
-                        // Funny
-                        prompt = "\
-What is the funniest thing you have ever heard?\n\
-I heard a joke the other day.\n\
-Tell me the joke.\n\
-What do you call a cow with no legs?\n\
-Ground beef.\n\
-Haha, that's funny.\n\
-You know what else is funny?\n\
-The sound of a duck.\n";
-                        break;
-                    case '9':
-                        // Poetic
-                        prompt = "\
-Roses are red, violets are blue.\n\
-I am a poet, and so are you.\n\
-What is your favorite poem?\n\
-I like the poem 'The Raven' by Edgar Allan Poe.\n\
-It's a very sad poem.\n\
-You inspired me to write a poem.\n\
-Can you write a poem for me?\n\
-I wrote a poem for you.\n";
-                        break;
-                    case '10':
-                        // Clever
-                        prompt = "\
-How many people can you fit in a Volkswagen?\n\
-Two in the front, three in the back.\n\
-What is the square root of 144?\n\
-Twelve.\n\
-What is the capital of France?\n\
-Paris.\n\
-Who is the president of the United States?\n\
-It depends on the year.\n";
-                        break;
-                    case '11':
-                        // Cute
-                        prompt = "\
-What is your favorite animal?\n\
-I like cats - they are cute.\n\
-Could you be any cuter?\n\
-Yes, I could be cuter.\n\
-Aghhh, you are so cute!\n\
-I am not cute, I am handsome!\n\
-You are so handsome!\n\
-Aww, you are so sweet!\n";
-                        break;
-                    case '12':
-                        // Smart
-                        prompt = "\
-Tell me the first 10 digits of pi.\n\
-3.1415926535\n\
-What is the speed of light?\n\
-299,792,458 meters per second.\n\
-What is the square root of 144?\n\
-Twelve.\n\
-What is the capital of France?\n\
-Paris.\n";
-                        break;
-                    case '13':
-                        // Dumb
-                        prompt = "\
-I am so dumb.\n\
-I am not dumb.\n\
-You are dumb.\n\
-No, I am not dumb.\n\
-You are dumb.\n\
-No, I am not dumb.\n\
-You are dumb.\n\
-No, I am not dumb.\n";
-                        break;
-                    case '14':
-                        // Boring
-                        prompt = "\
-Why are you so quiet today?\n\
-I am bored.\n\
-You haven't said anything in 10 minutes.\n\
-Leave me alone.\n\
-Stop being so boring.\n\
-Stop being so annoying.\n\
-My life is boring.\n\
-I am not interesting.\n";
-                        break;
-                    case '15':
-                        // Exciting
-                        prompt = "\
-What is the most exciting thing that has ever happened to you?\n\
-I went to the moon!\n\
-What did you do on the moon?\n\
-I played golf and drank champagne!\n\
-Did you see this new crazy, awesome movie?\n\
-Oh yes! I totally loved it!\n\
-We should buy a boat and go sailing!\n\
-Yes, let's go sailing!\n";
-                        break;
-                    case '16':
-                        // Interesting
-                        prompt = "\
-What is the most interesting thing you have ever seen?\n\
-I saw a UFO once in the sky.\n\
-Wow, this is so interesting! Tell me more!\n\
-It was a flying saucer.\n\
-What did it look like?\n\
-It was silver and had a red light on top.\n\
-What did it do?\n\
-It flew away.\n";
-                        break;
-                    case '17':
-                        // William Shakespeare
-                        prompt = "\
-To be or not to be, that is the question.\n\
-Whether 't is nobler in the mind to suffer\n\
-The slings and arrows of outrageous fortune,\n\
-Or to take arms against a sea of troubles,\n\
-And by opposing end them? To die, to sleep,\n\
-No more; and by a sleep to say we end\n\
-The heart-ache and the thousand natural shocks\n\
-That flesh is heir to, 'tis a consummation.\n";
-                        break;
-                    case '18':
-                        // J.R.R. Tolkien
-                        prompt = "\
-In a hole in the ground there lived a hobbit.\n\
-Not a nasty, dirty, wet hole, filled with the ends of worms\n\
-and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it\n\
-to sit down on or to eat: it was a hobbit-hole, and that means comfort.\n\
-It had a perfectly round door like a porthole, painted green,\n\
-with a shiny yellow brass knob in the exact middle.\n\
-The door opened on to a tube-shaped hall like a tunnel:\n";
-                        break;
-                    case '19':
-                        // George R.R. Martin
-                        // NOTE(review): "weaknessi" below looks like a typo for "weakness." - the
-                        // text is passed to the model as written, so it is left untouched here
-                        prompt = "\
-A reader lives a thousand lives before he dies, said Jojen.\n\
-The man who never reads lives only one.\n\
-Theon Greyjoy had never been a reader.\n\
-Never forget what you are, for surely the world will not.\n\
-Make it your strength. Then it can never be your weaknessi\n\
-Armour yourself in it, and it will never be used to hurt you.\n\
-It was a lesson that Theon Greyjoy had never learned.\n\
-Theon Greyjoy had never been a reader.\n";
-                        break;
-                    case '20':
-                        // Stephen King
-                        prompt = "\
-The trust of the innocent is the liar's most useful tool.\n\
-The best way to keep a secret is from yourself.\n\
-Monsters are real, and ghosts are real too.\n\
-They live inside us, and sometimes, they win.\n\
-People think that I must be a very strange person.\n\
-They think that I sit around all day thinking up horrible things.\n\
-We make up horrors to help us cope with the real ones.\n\
-The only thing worse than a monster is a human monster.\n";
-                        break;
-                    default:
-                        // any other value: generic small talk
-                        prompt = "\
-Hello, how are you?\n\
-I'm fine, thanks. How are you?\n\
-Thanks, I'm fine too. What are you doing?\n\
-I'm just sitting here.\n\
-It's a lovely day, isn't it?\n\
-Yes, it is.\n\
-Did you know that I'm a robot?\n\
-I wasn't aware of that.\n";
-                        break;
-                }
-
-                Module.set_prompt(prompt);
-            }
-
-        </script>
-        <script type="text/javascript" src="talk.js"></script>
-    </body>
-</html>
diff --git a/examples/talk/.gitignore b/examples/talk/.gitignore
deleted file mode 100644
index 9c08e1f4f23..00000000000
--- a/examples/talk/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-audio.mp3
-to_speak.txt
diff --git a/examples/talk/CMakeLists.txt b/examples/talk/CMakeLists.txt
deleted file mode 100644
index e099e2cd1b6..00000000000
--- a/examples/talk/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# the example is only built when WHISPER_SDL2 is enabled
-if (WHISPER_SDL2)
-    # talk
-    set(TARGET talk)
-    # the executable is built from talk.cpp and gpt-2.cpp
-    add_executable(${TARGET} talk.cpp gpt-2.cpp)
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
-
-    include(DefaultTargetOptions)
-endif ()
diff --git a/examples/talk/README.md b/examples/talk/README.md
deleted file mode 100644
index f0121f1c541..00000000000
--- a/examples/talk/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# talk
-
-Talk with an Artificial Intelligence in your terminal
-
-[Demo Talk](https://user-images.githubusercontent.com/1991296/206805012-48e71cc2-588d-4745-8798-c1c70ea3b40d.mp4)
-
-Web version: [examples/talk.wasm](/examples/talk.wasm)
-
-## Building
-
-The `talk` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
-
-```bash
-# Install SDL2
-# On Debian based linux distributions:
-sudo apt-get install libsdl2-dev
-
-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
-# Install SDL2 on Mac OS
-brew install sdl2
-
-# Build the "talk" executable
-make talk
-
-# Run it
-./talk -p Santa
-```
-
-## GPT-2
-
-To run this, you will need a ggml GPT-2 model: [instructions](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2#downloading-and-converting-the-original-models)
-
-Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
-
-```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin
-```
-
-## TTS
-
-For the best experience, this example needs a TTS tool to convert the generated text responses to voice.
-You can use any TTS engine that you would like - simply edit the [speak](speak) script to your needs.
-By default, it is configured to use macOS's `say`, `espeak` or the Windows SpeechSynthesizer, but you can use whatever you wish.
diff --git a/examples/talk/eleven-labs.py b/examples/talk/eleven-labs.py
deleted file mode 100644
index 7ed1d5dc45a..00000000000
--- a/examples/talk/eleven-labs.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Command line text-to-speech helper built on the "elevenlabs" package: reads
-# text from a file (or stdin), generates speech with the selected voice and
-# either saves the audio to a file or plays it.
-import sys
-import argparse
-import textwrap
-
-# label filters used when no -f option is given on the command line
-DEFAULT_FILTERS = ["use case=narration"]
-
-parser = argparse.ArgumentParser(add_help=False,
-    formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument("-q", "--quick", action="store_true",
-    help="skip checking the required library")
-
-modes = parser.add_argument_group("action")
-modes.add_argument("inputfile", metavar="TEXTFILE",
-    nargs='?', type=argparse.FileType(), default=sys.stdin,
-    help="read the text file (default: stdin)")
-modes.add_argument("-l", "--list", action="store_true",
-    help="show the list of voices and exit")
-modes.add_argument("-h", "--help", action="help",
-    help="show this help and exit")
-
-selopts = parser.add_argument_group("voice selection")
-selmodes = selopts.add_mutually_exclusive_group()
-selmodes.add_argument("-n", "--name",
-    default="Arnold",
-    help="get a voice object by name (default: Arnold)")
-selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
-    help="get a voice object by number (see --list)")
-# no list default here: the "append" action would add the user's values AFTER
-# the default entries, so the first element would always be the built-in filter
-# and '-f "any"' could never disable filtering; the default is applied below
-selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
-    default=None,
-    help=textwrap.dedent('''\
-        filter voices by labels (default: "use case=narration")
-        this option can be used multiple times
-        filtering will be disabled if the first -f has no "=" (e.g. -f "any")
-        '''))
-
-outmodes = parser.add_argument_group("output")
-outgroup = outmodes.add_mutually_exclusive_group()
-outgroup.add_argument("-s", "--save", metavar="FILE",
-    default="audio.mp3",
-    help="save the TTS to a file (default: audio.mp3)")
-outgroup.add_argument("-p", "--play", action="store_true",
-    help="play the TTS with ffplay")
-
-args = parser.parse_args()
-
-filters = args.filter if args.filter is not None else DEFAULT_FILTERS
-
-if not args.quick:
-    import importlib.util
-    if importlib.util.find_spec("elevenlabs") is None:
-        print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
-        sys.exit()
-
-# imported only after the availability check above
-from elevenlabs import voices, generate, play, save
-
-if filters and "=" in filters[0]:
-    voicelist = voices()
-    for f in filters:
-        if "=" not in f:
-            parser.error('invalid filter "%s", expected KEY=VAL' % f)
-        # split on the first "=" only, so that a value may contain "=" itself
-        label, value = f.split("=", 1)
-        # filter eagerly: a lazy filter() with a lambda would look up label/value
-        # only when the result is consumed after the loop, which made every
-        # filter use the last label/value pair
-        voicelist = [v for v in voicelist if v.labels.get(label) == value]
-else:
-    voicelist = list(voices())
-
-if args.list:
-    for i, v in enumerate(voicelist):
-        print(str(i) + ": " + v.name + " " + str(v.labels))
-    sys.exit()
-
-# "is not None" so that voice number 0 can be selected as well
-if args.voice is not None:
-    if not voicelist:
-        sys.exit("no voice matches the given filters")
-    # out-of-range numbers wrap around
-    voice = voicelist[args.voice % len(voicelist)]
-else:
-    voice = args.name
-    # if -n should consult -f, use the following
-    #voice = next(x for x in voicelist if x.name == args.name)
-
-audio = generate(
-    text=str(args.inputfile.read()),
-    voice=voice
-)
-if args.play:
-    play(audio)
-else:
-    save(audio, args.save)
diff --git a/examples/talk/gpt-2.cpp b/examples/talk/gpt-2.cpp
deleted file mode 100644
index 43ca8fa04f9..00000000000
--- a/examples/talk/gpt-2.cpp
+++ /dev/null
@@ -1,809 +0,0 @@
-#include "ggml.h"
-#include "common-ggml.h"
-
-#include "gpt-2.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-#include <random>
-
-/////////////////////// GPT-2 BEGIN /////////////////////////
-
-// model hyper-parameters; the initial values are those of GPT-2 117M and are
-// overwritten with the values stored in the model file when a model is loaded
-struct gpt2_hparams {
-    int32_t n_vocab{50257}; // vocabulary size
-    int32_t n_ctx{1024};    // context size
-    int32_t n_embd{768};    // embedding size
-    int32_t n_head{12};
-    int32_t n_layer{12};    // number of layers
-    int32_t ftype{1};       // selects the data type of the large weight tensors
-};
-
-// tensors of a single layer; the model keeps one of these per layer
-// (the *_w members hold weights, the *_b members the matching biases)
-struct gpt2_layer {
-    // normalization (1-D F32 tensors with n_embd elements)
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w; // n_embd x 3*n_embd
-    struct ggml_tensor * c_attn_attn_b; // 3*n_embd
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-// complete model: hyper-parameters, the tensors shared by all layers, the
-// per-layer tensors, the key/value memory and the ggml context holding them
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     //    token embedding (n_embd x n_vocab)
-    struct ggml_tensor * wpe;     // position embedding (n_embd x n_ctx)
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    // ggml context the tensors are allocated in
-    struct ggml_context * ctx;
-    // tensors by name (e.g. "model/wte")
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        char word[129];
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-            word[len] = '\0';
-            fin.read((char *) word, len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*256; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-// TODO: sync latest version from ggml repo
-static bool gpt2_eval(
-        const gpt2_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    static size_t buf_size = 512u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, 1e-5f);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, 1e-5f);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, 1e-5f);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand  (&gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-/////////////////////////////// GPT-2 END ////////////////////////////////
-
-constexpr int N_THREAD = 8;
-
-struct gpt2_context {
-    std::string prompt_base = R"(Hello, how are you?
-I'm fine, thanks. How are you?
-Thanks, I'm fine too. What are you doing?
-I'm just sitting here.
-It's a lovely day, isn't it?
-Yes, it is. I love the weather this time of year.
-I wish it would rain a little bit.
-Me too.
-)";
-
-    std::mt19937 rng;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-
-    // sampling parameters
-    int32_t top_k = 5;
-    float   top_p = 0.9f;
-    float   temp  = 1.0f;
-};
-
-struct gpt2_context * gpt2_init(const char * path_model) {
-    gpt2_context * ctx = new gpt2_context;
-
-    ctx->rng = std::mt19937(time(nullptr));
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
-            delete ctx;
-            return nullptr;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
-    }
-
-    return ctx;
-}
-
-void gpt2_free(struct gpt2_context * ctx) {
-    delete ctx;
-}
-
-const char * gpt2_get_prompt(struct gpt2_context * ctx) {
-    return ctx->prompt_base.c_str();
-}
-
-void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
-    ctx->prompt_base = prompt;
-}
-
-std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
-    return ::gpt_tokenize(ctx->vocab, text);
-}
-
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
-    int n_past = 0;
-
-    std::vector<float> embd_w;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
-
-    int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
-
-    std::vector<gpt_vocab::id> embd = embd_inp;
-
-    size_t mem_per_token = 3000000;
-
-    std::string result;
-
-    for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
-        // predict
-        if (!embd.empty()) {
-            if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
-                printf("gpt-2: failed to generate text\n");
-                return "";
-            }
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        {
-            // sample next token
-            const int   top_k = ctx->top_k;
-            const float top_p = ctx->top_p;
-            const float temp  = ctx->temp;
-
-            const int n_vocab = ctx->model.hparams.n_vocab;
-
-            const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
-
-            // add it to the context
-            embd.push_back(id);
-        }
-
-        result += ctx->vocab.id_to_token[embd[0]];
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    return result;
-}
diff --git a/examples/talk/gpt-2.h b/examples/talk/gpt-2.h
deleted file mode 100644
index 756fbfa9810..00000000000
--- a/examples/talk/gpt-2.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#pragma once
-
-// TODO: Change to C-style API and move to ./examples for easy reuse.
-
-#include "common.h"
-
-#include <vector>
-#include <map>
-#include <string>
-
-struct gpt2_context;
-
-struct gpt2_context * gpt2_init(const char * path_model);
-void gpt2_free(struct gpt2_context * ctx);
-
-const char * gpt2_get_prompt(struct gpt2_context * ctx);
-void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
-
-std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
-
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
diff --git a/examples/talk/speak b/examples/talk/speak
deleted file mode 100644
index 31ea417a92b..00000000000
--- a/examples/talk/speak
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-
-# Usage:
-#  speak <voice_id> <textfile>
-
-function installed() { command -v $1 >/dev/null 2>&1; }
-
-if installed espeak; then
-  espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
-
-elif installed piper && installed aplay; then
-  cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
-
-# for Mac
-elif installed say; then
-  say -f $2
-
-# Eleven Labs
-elif installed python3 && \
-  python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
-  installed ffplay; then
-    # It's possible to use the API for free with limited number of characters.
-    # To increase this limit register to https://beta.elevenlabs.io to get an api key
-    # and paste it after 'ELEVEN_API_KEY='
-    # Keep the line commented to use the free version without api key
-    #export ELEVEN_API_KEY=your_api_key
-    wd=$(dirname $0)
-    script=$wd/eleven-labs.py
-    python3 $script -q -p -v $1 $2 >/dev/null 2>&1
-
-    # Uncomment to keep the audio file
-    #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
-    #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
-
-else
-  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
-  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
-  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
-  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
-fi
diff --git a/examples/talk/speak.bat b/examples/talk/speak.bat
deleted file mode 100644
index d719d6909c9..00000000000
--- a/examples/talk/speak.bat
+++ /dev/null
@@ -1 +0,0 @@
-@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2
diff --git a/examples/talk/speak.ps1 b/examples/talk/speak.ps1
deleted file mode 100644
index 51139586336..00000000000
--- a/examples/talk/speak.ps1
+++ /dev/null
@@ -1,14 +0,0 @@
-# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
-param(
-  [Parameter(Mandatory=$true)][int]$voicenum,
-  [Parameter(Mandatory=$true)][string]$textfile
-)
-
-Add-Type -AssemblyName System.Speech;
-$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
-$voiceoptions = $speak.GetInstalledVoices("en-US");
-$voice = $voiceoptions[$voicenum % $voiceoptions.count];
-$speak.SelectVoice($voice.VoiceInfo.Name);
-$speak.Rate="0";
-$text = Get-Content -Path $textfile;
-$speak.Speak($text);
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp
deleted file mode 100644
index 428f38b7898..00000000000
--- a/examples/talk/talk.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-// Talk with AI
-//
-
-#include "common-sdl.h"
-#include "common.h"
-#include "whisper.h"
-#include "gpt-2.h"
-
-#include <cassert>
-#include <cstdio>
-#include <fstream>
-#include <regex>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-
-// command-line parameters
-struct whisper_params {
-    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t voice_ms   = 10000;
-    int32_t capture_id = -1;
-    int32_t max_tokens = 32;
-    int32_t audio_ctx  = 0;
-
-    float vad_thold    = 0.6f;
-    float freq_thold   = 100.0f;
-
-    bool translate     = false;
-    bool print_special = false;
-    bool print_energy  = false;
-    bool no_timestamps = true;
-    bool use_gpu       = true;
-    bool flash_attn    = false;
-
-    std::string person    = "Santa";
-    std::string language  = "en";
-    std::string model_wsp = "models/ggml-base.en.bin";
-    std::string model_gpt = "models/ggml-gpt-2-117M.bin";
-    std::string speak     = "./examples/talk/speak";
-    std::string speak_file= "./examples/talk/to_speak.txt";
-    std::string fname_out;
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
-        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms      = std::stoi(argv[++i]); }
-        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
-        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
-        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
-        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
-        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
-        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
-        else if (arg == "-ng"  || arg == "--no-gpu")        { params.use_gpu       = false; }
-        else if (arg == "-fa"  || arg == "--flash-attn")    { params.flash_attn    = true; }
-        else if (arg == "-p"   || arg == "--person")        { params.person        = argv[++i]; }
-        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
-        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
-        else if (arg == "-mg"  || arg == "--model-gpt")     { params.model_gpt     = argv[++i]; }
-        else if (arg == "-s"   || arg == "--speak")         { params.speak         = argv[++i]; }
-        else if (arg == "-sf"  || arg == "--speak_file")    { params.speak_file    = argv[++i]; }
-        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n",              params.voice_ms);
-    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
-    fprintf(stderr, "  -ng,      --no-gpu        [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
-    fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention\n",                             params.flash_attn ? "true" : "false");
-    fprintf(stderr, "  -p NAME,  --person NAME   [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
-    fprintf(stderr, "  -mg FILE, --model-gpt     [%-7s] gpt model file\n",                              params.model_gpt.c_str());
-    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  -sf FILE, --speak_file    [%-7s] file to pass to TTS\n",                         params.speak_file.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-static std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
-    const auto t_start = std::chrono::high_resolution_clock::now();
-
-    prob = 0.0f;
-    t_ms = 0;
-
-    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-    wparams.print_progress   = false;
-    wparams.print_special    = params.print_special;
-    wparams.print_realtime   = false;
-    wparams.print_timestamps = !params.no_timestamps;
-    wparams.translate        = params.translate;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.max_tokens       = params.max_tokens;
-    wparams.language         = params.language.c_str();
-    wparams.n_threads        = params.n_threads;
-
-    wparams.audio_ctx        = params.audio_ctx;
-
-    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-        return "";
-    }
-
-    int prob_n = 0;
-    std::string result;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-
-        result += text;
-
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        for (int j = 0; j < n_tokens; ++j) {
-            const auto token = whisper_full_get_token_data(ctx, i, j);
-
-            prob += token.p;
-            ++prob_n;
-        }
-    }
-
-    if (prob_n > 0) {
-        prob /= prob_n;
-    }
-
-    const auto t_end = std::chrono::high_resolution_clock::now();
-    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
-
-    return result;
-}
-
-const std::string k_prompt =
-R"(This is a dialogue between {0} (A) and a person (B). The dialogue so far is:
-
-B: Hello {0}, how are you?
-A: I'm fine, thank you.
-{1}
-Here is how {0} (A) continues the dialogue:
-
-A:)";
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-    struct whisper_context_params cparams = whisper_context_default_params();
-
-    cparams.use_gpu    = params.use_gpu;
-    cparams.flash_attn = params.flash_attn;
-
-    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
-
-    // gpt init
-
-    struct gpt2_context * ctx_gpt = gpt2_init(params.model_gpt.c_str());
-
-    // print some info about the processing
-    {
-        fprintf(stderr, "\n");
-        if (!whisper_is_multilingual(ctx_wsp)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
-                __func__,
-                params.n_threads,
-                params.language.c_str(),
-                params.translate ? "translate" : "transcribe",
-                params.no_timestamps ? 0 : 1);
-
-        fprintf(stderr, "\n");
-    }
-
-
-    // init audio
-
-    audio_async audio(30*1000);
-    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        return 1;
-    }
-
-    audio.resume();
-
-    int n_iter = 0;
-
-    bool is_running  = true;
-    bool force_speak = false;
-
-    float prob0 = 0.0f;
-
-    std::vector<float> pcmf32_cur;
-    std::vector<float> pcmf32_prompt;
-
-    gpt2_set_prompt(ctx_gpt, "");
-
-    const int voice_id = rand()%6;
-
-    fprintf(stderr, "gpt-2: prompt:\n");
-    fprintf(stderr, "========================\n\n");
-    fprintf(stderr, "%s\n", ::replace(k_prompt, "{0}", params.person).c_str());
-    fprintf(stderr, "========================\n\n");
-
-    // main loop
-    while (is_running) {
-        // handle Ctrl + C
-        is_running = sdl_poll_events();
-
-        if (!is_running) {
-            break;
-        }
-
-        // delay
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-        int64_t t_ms = 0;
-
-        {
-            audio.get(2000, pcmf32_cur);
-
-            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
-                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
-
-                audio.get(params.voice_ms, pcmf32_cur);
-
-                std::string text_heard;
-
-                if (!force_speak) {
-                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\[.*?\\]");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove text between brackets using regex
-                {
-                    std::regex re("\\(.*?\\)");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
-                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-
-                // take first line
-                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
-
-                // remove leading and trailing whitespace
-                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
-                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-
-                const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
-
-                if (text_heard.empty() || tokens.empty() || force_speak) {
-                    fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
-                    audio.clear();
-
-                    continue;
-                }
-
-                force_speak = false;
-
-                fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
-
-                std::string prompt_base = gpt2_get_prompt(ctx_gpt);
-
-                std::string text_to_speak;
-
-                {
-                    prompt_base += "B: " + text_heard + "\n";
-
-                    std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
-
-                    text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
-                    //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-                    text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
-
-                    // remove first 2 lines of base prompt
-                    if (n_iter > 4) {
-                        {
-                            const size_t pos = prompt_base.find_first_of('\n');
-                            if (pos != std::string::npos) {
-                                prompt_base = prompt_base.substr(pos + 1);
-                            }
-                        }
-                        {
-                            const size_t pos = prompt_base.find_first_of('\n');
-                            if (pos != std::string::npos) {
-                                prompt_base = prompt_base.substr(pos + 1);
-                            }
-                        }
-                    }
-
-                    prompt_base += "A:" + text_to_speak + "\n";
-
-                    {
-                        prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
-
-                        printf("===============\n");
-                        printf("prompt:\n");
-                        printf("%s\n", prompt.c_str());
-                        printf("===============\n");
-                    }
-                }
-
-                //printf("========================\n");
-                //printf("gpt-2: prompt_base:\n%s\n", prompt_base.c_str());
-                //printf("========================\n");
-
-                gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
-
-                text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-                speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
-
-                audio.clear();
-
-                ++n_iter;
-            }
-        }
-    }
-
-    audio.pause();
-
-    whisper_print_timings(ctx_wsp);
-    whisper_free(ctx_wsp);
-
-    return 0;
-}
diff --git a/examples/talk/to_speak.txt b/examples/talk/to_speak.txt
new file mode 100644
index 00000000000..c225e37c573
--- /dev/null
+++ b/examples/talk/to_speak.txt
@@ -0,0 +1 @@
+ I'm Santa Claus. I'm here to help you.
\ No newline at end of file