No bindings are required. Just create a stub C file containing a single #include, and you can start building D applications powered by llama.cpp. That's it: apart from creating the stub file, no extra work is needed.
Tested on Windows with dmd v2.107.
Code
//llamad.c:
#include "llama.h"
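(Optional) As a quick sanity check that ImportC picked up the header, a minimal sketch like the one below just prints llama.cpp's system-info string and exits. The file name smoke-test.d is my own choice, and it assumes the same llama.cpp revision as the demo below (where llama_backend_init takes no arguments):

// smoke-test.d (hypothetical): verify the stub C file compiles and links
module smoke_test;
import std.stdio : writeln;
import std.string : fromStringz;
import llamad; // all of llama.h, pulled in through llamad.c

void main()
{
    llama_backend_init();                            // one-time backend setup
    writeln(fromStringz(llama_print_system_info())); // shows which CPU/GPU features are enabled
    llama_backend_free();                            // release backend state
}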
A D port of the simple example from llama.cpp:
// llama-d.d:
module llama_d;
import std.string;
import std.stdio;
import llamad; //imports llamad.c
//pragma(msg, __traits(allMembers, llamad));
void main(string[] args)
{
    if (args.length < 3) {
        writeln("LLAMA D DEMO USAGE: llama-d <path_to_model> <your_prompt>");
        return;
    }

    llama_backend_init();
    llama_numa_init(GGML_NUMA_STRATEGY_DISABLED);

    auto mparams = llama_model_default_params();
    // mparams.n_gpu_layers = 30; // offload layers to the GPU to speed up inference

    auto ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 2048;
    import std.parallelism;
    ctx_params.n_threads = totalCPUs - 1;
    ctx_params.n_threads_batch = ctx_params.n_threads_batch == -1 ? ctx_params.n_threads : ctx_params.n_threads_batch;

    llama_model* model = llama_load_model_from_file(toStringz(args[1]), mparams);
    llama_context* ctx = llama_new_context_with_model(model, ctx_params);

    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    const bool allow_special = false;

    string prompt = args[2];
    if (!prompt.length)
        return;

    // convert the prompt to tokens
    llama_token[] embd_inp;
    embd_inp.length = prompt.length;
    writeln("tokenizing...");
    auto n_of_tok = llama_tokenize(llama_get_model(ctx), prompt.ptr, cast(int) prompt.length,
        embd_inp.ptr, cast(int) embd_inp.length, add_bos, allow_special);
    embd_inp.length = n_of_tok;
    if (!n_of_tok) {
        writeln("no tokens generated, something went wrong");
        return;
    }
    writeln("input has ", n_of_tok, " tokens");
    foreach (id; embd_inp) {
        write(llama_token_to_piece(ctx, id));
    }
    writeln();

    // total length of the sequence, including the prompt
    const int n_len = 128;
    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = cast(int)(embd_inp.length + (n_len - embd_inp.length));
    if (n_kv_req > n_ctx) {
        writeln("error: prompt is too long");
        return;
    }

    writeln("building batch");
    // create a llama_batch of size 512;
    // this object is used to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < embd_inp.length; i++) {
        // note: seq_ids = [0] is required because there must be at least one sequence id
        llama_batch_add(batch, embd_inp[i], cast(int) i, [0], false);
    }
    // llama_decode outputs logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    writeln("decoding batch");
    if (llama_decode(ctx, batch) != 0) {
        writeln("llama_decode() failed");
        return;
    }

    // main loop
    int n_cur = batch.n_tokens;
    int n_decode = 0;
    const t_main_start = ggml_time_us();

    while (n_cur <= n_len) {
        // sample the next token
        {
            auto n_vocab = llama_n_vocab(model);
            auto logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

            llama_token_data[] candidates;
            candidates.reserve(n_vocab); // reserve capacity, then append one entry per vocab id
            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                candidates ~= llama_token_data(token_id, logits[token_id], 0.0f);
            }
            llama_token_data_array candidates_p = { candidates.ptr, cast(int) candidates.length, false };

            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it the end of the stream?
            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                writeln();
                break;
            }
            writef("%s", llama_token_to_piece(ctx, new_token_id));

            // prepare the next batch
            llama_batch_clear(batch);
            // push this new token for the next evaluation
            llama_batch_add(batch, new_token_id, n_cur, [0], true);

            n_decode += 1;
        }
        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
            writefln("%s : failed to eval, return code %d\n", __FUNCTION__, 1);
            return;
        }
    }

    const t_main_end = ggml_time_us();
    llama_print_timings(ctx);
    writeln();

    // cleanup
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
}
void llama_batch_add(ref llama_batch batch, llama_token id, llama_pos pos, const llama_seq_id[] seq_ids, bool logits) {
    batch.token[batch.n_tokens] = id;
    batch.pos[batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = cast(int) seq_ids.length;
    for (size_t i = 0; i < seq_ids.length; ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits[batch.n_tokens] = logits;
    batch.n_tokens++;
}
string llama_token_to_piece(llama_context* ctx, llama_token token) {
    char[] result;
    result.length = 8;
    const int n_tokens = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
    if (n_tokens < 0) {
        // buffer was too small: resize to the required length and retry
        result.length = -n_tokens;
        int check = llamad.llama_token_to_piece(llama_get_model(ctx), token, result.ptr, cast(int) result.length);
        assert(check == -n_tokens);
    } else {
        result.length = n_tokens;
    }
    return cast(string) result;
}
void llama_batch_clear(ref llama_batch batch) {
    batch.n_tokens = 0;
}
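The loop above always picks the single most likely token (greedy sampling). For more varied output, the same era of llama.h also exposes llama_sample_top_k, llama_sample_temp and llama_sample_token alongside llama_sample_token_greedy; below is a rough sketch of a helper that could replace the greedy call. Treat it as an assumption-laden example rather than tested code; the values 40 and 0.8f are arbitrary placeholders, not tuned settings:

// Hypothetical alternative to llama_sample_token_greedy in the main loop above.
llama_token sample_top_k_temp(llama_context* ctx, ref llama_token_data_array candidates_p,
                              int top_k = 40, float temp = 0.8f)
{
    llama_sample_top_k(ctx, &candidates_p, top_k, 1); // keep only the k most likely candidates
    llama_sample_temp(ctx, &candidates_p, temp);      // rescale the remaining logits
    return llama_sample_token(ctx, &candidates_p);    // draw a token from that distribution
}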
Build
Build in the llama.cpp directory with this command (I have been using CUDA, but it also works without it):
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib -L/LIBPATH:"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.3/lib/x64" cuda.lib cudart.lib cufft.lib cublas.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib
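If llama.cpp was built without CUDA, the CUDA import libraries can presumably be dropped. Something along these lines should work, though the exact library names depend on how your build was configured, so treat this as a guess rather than a verified command:
dmd llama-d.d llamad.c -m64 build/ggml_static.lib build/llama.lib ucrtd.lib -L/NODEFAULTLIB:libucrt.lib -L/NODEFAULTLIB:libcmt.lib msvcprtd.lib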
Run
llama-d "E:\ML\pretrained\speechless-llama2-hermes-orca-platypus-wizardlm-13b.Q5_K_M.gguf" "How to quit vim "
Or use a pure D implementation of llama based on Karpathy's llama2.c code.