Skip to content

Commit b5dbcf6

Browse files
committed
Smoothing factor backport
1 parent f87f7b8 commit b5dbcf6

File tree

5 files changed

+23
-6
lines changed

5 files changed

+23
-6
lines changed

common/sampling.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ static void sampler_queue(
132132
const float temp = params.temp;
133133
const float dynatemp_range = params.dynatemp_range;
134134
const float dynatemp_exponent = params.dynatemp_exponent;
135+
const float smoothing_factor = params.smoothing_factor;
135136
const int32_t top_k = params.top_k;
136137
const float top_p = params.top_p;
137138
const float min_p = params.min_p;
@@ -147,10 +148,10 @@ static void sampler_queue(
147148
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
148149
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
149150
case llama_sampler_type::TEMPERATURE:
150-
if (dynatemp_range > 0) {
151+
if (dynatemp_range > 0 || smoothing_factor > 0) {
151152
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
152153
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
153-
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
154+
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor);
154155
} else {
155156
llama_sample_temp(ctx_main, &cur_p, temp);
156157
}

common/sampling.h

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ typedef struct llama_sampling_params {
3131
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
3232
float dynatemp_range = 0.00f; // 0.0 = disabled
3333
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
34+
float smoothing_factor = 0.0f; // controls the quadratic adjustment in smooth sampling
3435
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
3536
float penalty_repeat = 1.00f; // 1.0 = disabled
3637
float penalty_freq = 0.00f; // 0.0 = disabled

examples/server/server.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,7 @@ struct server_context {
839839
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
840840
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
841841
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
842+
slot.sparams.smoothing_factor = json_value(data, "smoothing_factor", default_sparams.smoothing_factor);
842843
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
843844
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
844845
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);

llama.cpp

+15-2
Original file line numberDiff line numberDiff line change
@@ -12183,14 +12183,27 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
1218312183
}
1218412184
}
1218512185

12186-
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
12186+
void llama_sample_entropy(struct llama_context* ctx, llama_token_data_array* candidates_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) {
1218712187
const int64_t t_start_sample_us = ggml_time_us();
1218812188

1218912189
// no need to do anything if there is only one (or zero) candidates
12190-
if(candidates_p->size <= 1) {
12190+
if (candidates_p->size <= 1) {
1219112191
return;
1219212192
}
1219312193

12194+
// Apply smoothing only when smoothing_factor > 0; otherwise the base implementation is left unchanged.
12195+
if (smoothing_factor > 0 && candidates_p->size > 1) {
12196+
llama_sample_softmax(ctx, candidates_p);
12197+
float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation
12198+
12199+
// Apply quadratic transformation using the smoothing_factor
12200+
for (size_t i = 0; i < candidates_p->size; ++i) {
12201+
float logit_shifted = candidates_p->data[i].logit - h;
12202+
candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h;
12203+
}
12204+
llama_sample_softmax(ctx, candidates_p);
12205+
}
12206+
1219412207
// Calculate maximum possible entropy
1219512208
float max_entropy = -logf(1.0f / candidates_p->size);
1219612209

llama.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -864,13 +864,14 @@ extern "C" {
864864
float p,
865865
size_t min_keep);
866866

867-
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
867+
/// @details Dynamic temperature and Smooth Sampling implementations wrapped into one function; no research papers are available.
868868
LLAMA_API void llama_sample_entropy(
869869
struct llama_context * ctx,
870870
llama_token_data_array * candidates_p,
871871
float min_temp,
872872
float max_temp,
873-
float exponent_val);
873+
float exponent_val,
874+
float smoothing_factor);
874875

875876
LLAMA_API void llama_sample_temp(
876877
struct llama_context * ctx,

0 commit comments

Comments
 (0)