Add more choices to quantization tool. Post processing after sim_anneal(). (optimizer.py/ext_quant.cpp) #712

Open · wants to merge 18 commits into master
10 changes: 5 additions & 5 deletions exllamav2/exllamav2_ext/cpp/sampling.cpp
@@ -38,7 +38,7 @@ void apply_rep_penalty_cpu
// {
// if (g_rep_mask) free(g_rep_mask);
// g_vocab_size = vocab_size;
// g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool));
// g_rep_mask = (bool*) calloc(1, g_vocab_size * sizeof(bool));
// }
// memset(g_rep_mask, 0, g_vocab_size * sizeof(bool));
bool* g_rep_mask = (bool*) calloc(vocab_size, sizeof(bool));
@@ -655,7 +655,7 @@ int tfs_cpu

int nc = sort_descending(num_candidates, temp_probs, temp_indices, num_candidates);

float* derivative = (float*) malloc(nc * sizeof(float));
float* derivative = (float*) calloc(1, nc * sizeof(float));
float dsum = 0.0f;
for (int i = 0; i < nc - 2; i++)
{
@@ -759,9 +759,9 @@ int typical_cpu

int r_candidates = pre_sort_descending(num_candidates, temp_probs, temp_indices);

float* temp = (float*) malloc(r_candidates * sizeof(float));
int* entropy_dev_order = (int*) malloc(r_candidates * sizeof(int));
int* temp_indices_2 = (int*) malloc(r_candidates * sizeof(int));
float* temp = (float*) calloc(1, r_candidates * sizeof(float));
int* entropy_dev_order = (int*) calloc(1, r_candidates * sizeof(int));
int* temp_indices_2 = (int*) calloc(1, r_candidates * sizeof(int));

float neg_entropy = 0.0f;
for (int i = 0; i < r_candidates; i++)
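
The recurring change in these hunks replaces `malloc` with `calloc` so the scratch buffers start out zero-filled instead of holding indeterminate bytes. A minimal standalone sketch of the difference (not taken from the PR):

```cpp
#include <cstdio>
#include <cstdlib>

int main()
{
    const int n = 8;

    // malloc returns uninitialized memory: reading an element before
    // writing it is undefined behavior.
    float* a = (float*) malloc(n * sizeof(float));

    // calloc(1, size) returns the same amount of memory, zero-filled,
    // so unwritten elements read back as 0.0f.
    float* b = (float*) calloc(1, n * sizeof(float));

    b[0] = 1.0f;                  // write only part of the buffer
    for (int i = 0; i < n; i++)
        printf("%f\n", b[i]);     // 1.000000, then seven 0.000000

    free(a);
    free(b);
    return 0;
}
```

`calloc(1, size)` and `calloc(count, elem_size)` request the same number of bytes; the PR uses the former so the argument expression stays identical to the old `malloc(size)` call sites.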
10 changes: 5 additions & 5 deletions exllamav2/exllamav2_ext/cuda/cache.cu
@@ -165,16 +165,16 @@ __global__ void fp16_to_q_kv_paged_kernel

int page = block_table[pages_per_seq * y + x];
int seqlen = cache_seqlens[y];
int vx_a = page_size * x;
int px_a = seqlen - vx_a;
int vx_a = (int64_t)page_size * x;
int px_a = (int64_t)seqlen - vx_a;
int px_b = px_a + q_len;

if (dim % BLOCKSIZE_Q)
{
while ((px_a * dim) % BLOCKSIZE_Q) px_a--;
while ((px_b * dim) % BLOCKSIZE_Q) px_b++;
}

px_a = max(px_a, 0);
px_b = min(px_b, page_size);

@@ -346,7 +346,7 @@ __global__ void q_to_fp16_kv_paged_kernel
int seqlen = cache_seqlens[y];
if (!seqlen) return;

int vx_a = page_size * x;
int vx_a = (int64_t)page_size * x;
int vx_b = min(vx_a + page_size, seqlen);

if (dim < BLOCKSIZE_Q)
@@ -491,4 +491,4 @@ void array_q_to_fp16_kv_cuda
v_in, v_scales, v_out,
dim, offset, stride
);
}
}
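
Casting one operand to `int64_t` makes the `page_size * x` product (and the following subtraction) happen in 64-bit arithmetic, so the intermediate value cannot overflow a signed 32-bit int; the result is still assigned to an `int`, so this only helps when the final offset itself fits in 32 bits. A small illustration with made-up values, not from the PR:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical values chosen so the product exceeds INT_MAX.
    int page_size = 65536;
    int x         = 40000;

    // int bad = page_size * x;          // 32-bit multiply: signed overflow (UB)

    // Widening one operand first performs the multiply in 64 bits.
    int64_t vx_a = (int64_t)page_size * x;
    printf("%lld\n", (long long)vx_a);   // 2621440000

    return 0;
}
```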
13 changes: 11 additions & 2 deletions exllamav2/exllamav2_ext/cuda/q_matrix.cu
@@ -603,9 +603,18 @@ bool QMatrix::make_sequential(const uint32_t* cpu_g_idx, cudaStream_t stream)
return false;
}

// Zero out the allocated memory
size_t mem_size = (height / 8) * width * sizeof(uint32_t);
err = cudaMemset(cuda_new_qweight, 0, mem_size);
if (err != cudaSuccess) {
printf("CUDA memset failed: %s\n", cudaGetErrorString(err));
cudaFree(cuda_new_qweight); // Free the allocated memory in case of error
return false;
}

uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) calloc(1, height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) calloc(1, height * sizeof(uint32_t));

// Group histogram

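The added guard zeroes the freshly allocated `cuda_new_qweight` buffer and releases it if `cudaMemset` fails. A minimal sketch of the same pattern in isolation; the helper name and signature are illustrative and not part of the PR:

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Zero a device buffer and clean up on failure, mirroring the guard
// added in QMatrix::make_sequential (false signals failure to the caller).
bool zero_device_buffer(uint32_t* dev_ptr, size_t bytes)
{
    cudaError_t err = cudaMemset(dev_ptr, 0, bytes);
    if (err != cudaSuccess)
    {
        printf("CUDA memset failed: %s\n", cudaGetErrorString(err));
        cudaFree(dev_ptr);   // free the allocation so the caller does not leak it
        return false;
    }
    return true;
}
```
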
2 changes: 1 addition & 1 deletion exllamav2/exllamav2_ext/cuda/util.cu
@@ -2,7 +2,7 @@

void print_global_mem(const half* ptr, int rows, int columns, int stride)
{
half* temp = (half*) malloc(rows * columns * sizeof(half));
half* temp = (half*) calloc(1, rows * columns * sizeof(half));

cudaDeviceSynchronize();
cudaMemcpyAsync(temp, ptr, rows * columns * sizeof(half), cudaMemcpyDeviceToHost);
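
`print_global_mem` is a debug helper that copies device memory into a host buffer for printing; with `calloc`, any bytes the copy does not overwrite read back as zero instead of garbage. A self-contained sketch of the same idea using the synchronous `cudaMemcpy`, so the host buffer is ready to read when the call returns (the helper name is hypothetical, not from the repo):

```cpp
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdio>
#include <cstdlib>

// Copy a device half buffer to a zero-initialized host buffer and print it.
void dump_device_half(const half* dev_ptr, int rows, int columns)
{
    size_t bytes = (size_t)rows * columns * sizeof(half);
    half* temp = (half*) calloc(1, bytes);   // zero-filled, as in the PR
    if (!temp) return;

    // cudaMemcpy blocks for device-to-host copies into pageable memory,
    // so no explicit synchronization is needed before reading temp.
    cudaMemcpy(temp, dev_ptr, bytes, cudaMemcpyDeviceToHost);

    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < columns; c++)
            printf("%8.4f ", __half2float(temp[r * columns + c]));
        printf("\n");
    }
    free(temp);
}
```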