llama.cpp is a lightweight, open-source framework for AIGC-style large language models written in C++. It can run LLMs locally on ordinary consumer-grade hardware, and it can also be embedded in an application as a library to provide GPT-like functionality.
The demo below builds against the llama.cpp source and uses its C++ API to load a local model file and perform GPT-style text generation. The project layout is as follows:
llamacpp_starter
├── llama.cpp-b1547
├── src
│   └── main.cpp
└── CMakeLists.txt
CMakeLists.txt
cmake_minimum_required(VERSION 3.15)

project(llamacpp_starter)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# build llama.cpp and its common helper library from the bundled source tree
add_subdirectory(llama.cpp-b1547)

include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547/common
)

file(GLOB SRC
    src/*.h
    src/*.cpp
)

add_executable(${PROJECT_NAME} ${SRC})

target_link_libraries(${PROJECT_NAME}
    common
    llama
)
main.cpp
#include <iostream>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

int main(int argc, char** argv)
{
    bool numa_support = false;
    const std::string model_file_path = "./llama-ggml.gguf";
    const std::string prompt = "once upon a time";  // input words
    const int n_len = 32;                           // total length of the sequence including the prompt

    // set gpt params
    gpt_params params;
    params.model = model_file_path;
    params.prompt = prompt;

    // init LLM
    llama_backend_init(numa_support);

    // load model
    llama_model_params model_params = llama_model_default_params();
    //model_params.n_gpu_layers = 99; // offload all layers to the GPU

    llama_model* model = llama_load_model_from_file(model_file_path.c_str(), model_params);
    if (model == NULL)
    {
        std::cerr << __func__ << " load model file error" << std::endl;
        return 1;
    }

    // init context
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.seed = 1234;
    ctx_params.n_ctx = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context* ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL)
    {
        std::cerr << __func__ << " failed to create the llama_context" << std::endl;
        return 1;
    }

    // tokenize the prompt
    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx)
    {
        std::cerr << __func__ << " error: n_kv_req > n_ctx, the required KV cache size is not big enough" << std::endl;
        std::cerr << __func__ << " either reduce n_parallel or increase n_ctx" << std::endl;
        return 1;
    }

    // print the prompt token-by-token
    for (auto id : tokens_list)
        std::cout << llama_token_to_piece(ctx, id) << " ";
    std::cout << std::endl;

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); i++)
        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0)
    {
        std::cerr << __func__ << " llama_decode failed" << std::endl;
        return 1;
    }

    // main loop to generate words
    int n_cur = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len)
    {
        // sample the next token
        auto n_vocab = llama_n_vocab(model);
        auto* logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);

        for (llama_token token_id = 0; token_id < n_vocab; token_id++)
        {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // sample the most likely token
        const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream?
        if (new_token_id == llama_token_eos(model) || n_cur == n_len)
        {
            std::cout << std::endl;
            break;
        }

        std::cout << llama_token_to_piece(ctx, new_token_id) << " ";

        // prepare the next batch
        llama_batch_clear(batch);

        // push this new token for next evaluation
        llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

        n_decode += 1;
        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch))
        {
            std::cerr << __func__ << " failed to eval" << std::endl;
            return 1;
        }
    }

    std::cout << std::endl;

    const auto t_main_end = ggml_time_us();

    std::cout << __func__ << " decoded " << n_decode << " tokens in "
              << (t_main_end - t_main_start) / 1000000.0f << " s, speed: "
              << n_decode / ((t_main_end - t_main_start) / 1000000.0f) << " t / s" << std::endl;

    llama_print_timings(ctx);

    llama_batch_free(batch);

    // free context
    llama_free(ctx);
    llama_free_model(model);

    // free LLM
    llama_backend_free();

    return 0;
}
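The loop above always picks the single highest-probability token (llama_sample_token_greedy), so the output is deterministic for a given prompt and model. The b1547-era llama.h also exposes sampling helpers such as llama_sample_top_k, llama_sample_top_p, llama_sample_temp and llama_sample_token, which can be chained on the same candidate array for more varied output. The following is only a minimal sketch of that idea; the k, top-p and temperature values are illustrative assumptions, not recommended settings:

// sketch: replaces the llama_sample_token_greedy() call inside the generation loop;
// candidates_p is the llama_token_data_array already built from the logits above

// keep only the 40 most likely tokens (40 is an arbitrary example value)
llama_sample_top_k(ctx, &candidates_p, 40, 1);
// keep the smallest set of tokens whose cumulative probability exceeds 0.9
llama_sample_top_p(ctx, &candidates_p, 0.9f, 1);
// rescale the remaining logits by an example temperature of 0.8
llama_sample_temp(ctx, &candidates_p, 0.8f);
// draw a token from the filtered distribution instead of taking the argmax
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);

Note that each helper modifies candidates_p in place, so the order of the calls matters; the common library in the same source tree also provides higher-level sampling wrappers if you prefer not to chain these calls by hand.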