Merge branch 'concedo_experimental' into croco_nex_0

Nexesenex · Nexesenex · commit 0cd8d48d6eae · 2025-01-29T21:08:18.000+01:00
diff --git a/Makefile b/Makefile
@@ -30,14 +30,14 @@ endif
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
-	ifneq ($(UNAME_P),arm)
-		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-		ifeq ($(SYSCTL_M),1)
-			# UNAME_P := arm
-			# UNAME_M := arm64
-			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-		endif
-	endif
+ifneq ($(UNAME_P),arm)
+	SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+ifeq ($(SYSCTL_M),1)
+	# UNAME_P := arm
+	# UNAME_M := arm64
+	warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+endif
+endif
 endif
 
 #
@@ -112,9 +112,9 @@ ifeq ($(UNAME_S),Darwin)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
 	CLANG_VER = $(shell clang -v 2>&1 | head -n 1 | awk 'BEGIN {FS="[. ]"};{print $$1 $$2 $$4}')
-	ifeq ($(CLANG_VER),Appleclang15)
-		LDFLAGS += -ld_classic
-	endif
+ifeq ($(CLANG_VER),Appleclang15)
+	LDFLAGS += -ld_classic
+endif
 endif
 ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS   += -pthread
@@ -149,48 +149,48 @@ CXXV := $(shell $(CXX) --version | head -n 1)
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
-	# Use all CPU extensions that are available:
+# Use all CPU extensions that are available:
 # old library NEEDS mf16c to work. so we must build with it. new one doesnt
-	ifeq ($(OS),Windows_NT)
-		ifdef LLAMA_PORTABLE
-		CFLAGS +=
-		NONECFLAGS +=
-		SIMPLECFLAGS += -mavx -msse3 -mssse3
-		SIMPLERCFLAGS += -msse3 -mssse3
-		ifdef LLAMA_NOAVX2
-			FULLCFLAGS += -msse3 -mssse3 -mavx
-		else
-			FULLCFLAGS += -mavx2 -msse3 -mssse3 -mfma -mf16c -mavx
-		endif
-		else
-		CFLAGS += -march=native -mtune=native
-		endif
-	else
-		ifdef LLAMA_PORTABLE
-		CFLAGS +=
-		NONECFLAGS +=
-		SIMPLECFLAGS += -mavx -msse3 -mssse3
-		SIMPLERCFLAGS += -msse3 -mssse3
-		ifdef LLAMA_NOAVX2
-			FULLCFLAGS += -msse3 -mssse3 -mavx
-		else
-			FULLCFLAGS += -mavx2 -msse3 -mssse3 -mfma -mf16c -mavx
-		endif
-		else
-		CFLAGS += -march=native -mtune=native
-		endif
-	endif
+ifeq ($(OS),Windows_NT)
+ifdef LLAMA_PORTABLE
+	CFLAGS +=
+	NONECFLAGS +=
+	SIMPLECFLAGS += -mavx -msse3 -mssse3
+	SIMPLERCFLAGS += -msse3 -mssse3
+ifdef LLAMA_NOAVX2
+	FULLCFLAGS += -msse3 -mssse3 -mavx
+else
+	FULLCFLAGS += -mavx2 -msse3 -mssse3 -mfma -mf16c -mavx
+endif
+else
+	CFLAGS += -march=native -mtune=native
+endif
+else
+ifdef LLAMA_PORTABLE
+	CFLAGS +=
+	NONECFLAGS +=
+	SIMPLECFLAGS += -mavx -msse3 -mssse3
+	SIMPLERCFLAGS += -msse3 -mssse3
+ifdef LLAMA_NOAVX2
+	FULLCFLAGS += -msse3 -mssse3 -mavx
+else
+	FULLCFLAGS += -mavx2 -msse3 -mssse3 -mfma -mf16c -mavx
+endif
+else
+	CFLAGS += -march=native -mtune=native
+endif
+endif
 endif
 
 ifndef LLAMA_NO_ACCELERATE
-	# Mac M1 - include Accelerate framework.
-	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
-	ifeq ($(UNAME_S),Darwin)
-		CFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
-		CXXFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
-		LDFLAGS += -framework Accelerate
-		OBJS += ggml-blas.o
-	endif
+# Mac M1 - include Accelerate framework.
+# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
+ifeq ($(UNAME_S),Darwin)
+	CFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+	CXXFLAGS  += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+	LDFLAGS += -framework Accelerate
+	OBJS += ggml-blas.o
+endif
 endif
 
 # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
@@ -315,17 +315,17 @@ ggml_v3-cuda.o: otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_HIPBLAS
-	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH	?= /usr
-		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
-		HCC         := $(ROCM_PATH)/bin/hipcc
-		HCXX        := $(ROCM_PATH)/bin/hipcc
-	else
-		ROCM_PATH	?= /opt/rocm
-		GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-		HCC         := $(ROCM_PATH)/llvm/bin/clang
-		HCXX        := $(ROCM_PATH)/llvm/bin/clang++
-	endif
+ifeq ($(wildcard /opt/rocm),)
+	ROCM_PATH	?= /usr
+	GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+	HCC         := $(ROCM_PATH)/bin/hipcc
+	HCXX        := $(ROCM_PATH)/bin/hipcc
+else
+	ROCM_PATH	?= /opt/rocm
+	GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	HCC         := $(ROCM_PATH)/llvm/bin/clang
+	HCXX        := $(ROCM_PATH)/llvm/bin/clang++
+endif
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
@@ -370,26 +370,26 @@ ggml-metal.o: ggml/src/ggml-metal/ggml-metal.m ggml/src/ggml-metal/ggml-metal-im
 endif # LLAMA_METAL
 
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# Apple M1, M2, etc.
-	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	ifdef LLAMA_PORTABLE
-		CFLAGS +=
-		CXXFLAGS +=
-	else
-		# sve is cooked on termux so we are disabling it
-		ifeq ($(UNAME_O), Android)
-			ifneq ($(findstring clang, $(CCV)), )
-				CFLAGS += -mcpu=native+nosve
-				CXXFLAGS += -mcpu=native+nosve
-			else
-				CFLAGS += -mcpu=native
-				CXXFLAGS += -mcpu=native
-			endif
-		else
-			CFLAGS += -mcpu=native
-			CXXFLAGS += -mcpu=native
-		endif
-	endif
+# Apple M1, M2, etc.
+# Raspberry Pi 3, 4, Zero 2 (64-bit)
+ifdef LLAMA_PORTABLE
+	CFLAGS +=
+	CXXFLAGS +=
+else
+# sve is cooked on termux so we are disabling it
+ifeq ($(UNAME_O), Android)
+ifneq ($(findstring clang, $(CCV)), )
+	CFLAGS += -mcpu=native+nosve
+	CXXFLAGS += -mcpu=native+nosve
+else
+	CFLAGS += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+else
+	CFLAGS += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+endif
 endif
 
 ifneq ($(filter armv6%,$(UNAME_M)),)
@@ -409,10 +409,10 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
-	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS   += -mcpu=power9
-		CXXFLAGS += -mcpu=power9
-	endif
+ifneq (,$(findstring POWER9,$(POWER9_M)))
+	CFLAGS   += -mcpu=power9
+	CXXFLAGS += -mcpu=power9
+endif
 endif
 
 
@@ -427,49 +427,49 @@ NOTIFY_MSG =
 
 ifeq ($(OS),Windows_NT)
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS)  $^ -shared -o $@.dll $(LDFLAGS)
-	ifdef LLAMA_PORTABLE
+ifdef LLAMA_PORTABLE
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
-	endif
+endif
 
-	ifdef LLAMA_CLBLAST
+ifdef LLAMA_CLBLAST
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
-	endif
-	ifdef LLAMA_VULKAN
+endif
+ifdef LLAMA_VULKAN
 	VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ lib/vulkan-1.lib -shared -o $@.dll $(LDFLAGS)
-	endif
-
-	ifdef LLAMA_CUBLAS
-		CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
-	endif
-	ifdef LLAMA_HIPBLAS
-		HIPBLAS_BUILD = $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS)
-	endif
+endif
+
+ifdef LLAMA_CUBLAS
+	CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS)
+endif
+ifdef LLAMA_HIPBLAS
+	HIPBLAS_BUILD = $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.dll $(HIPLDFLAGS) $(LDFLAGS)
+endif
 else
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS)  $^ -shared -o $@.so $(LDFLAGS)
-	ifdef LLAMA_PORTABLE
-	ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
+ifdef LLAMA_PORTABLE
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS)  $^ -shared -o $@.so $(LDFLAGS)
 	NOAVX2_BUILD = $(CXX) $(CXXFLAGS)  $^ -shared -o $@.so $(LDFLAGS)
-	endif
-	endif
-
-	ifdef LLAMA_CLBLAST
-		ifeq ($(UNAME_S),Darwin)
-			CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -shared -o $@.so $(LDFLAGS)
-		else
-			CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -shared -o $@.so $(LDFLAGS)
-		endif
-	endif
-	ifdef LLAMA_CUBLAS
-		CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
-	endif
-	ifdef LLAMA_HIPBLAS
-		HIPBLAS_BUILD = $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS)
-	endif
-	ifdef LLAMA_VULKAN
-		VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ -lvulkan -shared -o $@.so $(LDFLAGS)
-	endif
+endif
+endif
+
+ifdef LLAMA_CLBLAST
+ifeq ($(UNAME_S),Darwin)
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -shared -o $@.so $(LDFLAGS)
+else
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -shared -o $@.so $(LDFLAGS)
+endif
+endif
+ifdef LLAMA_CUBLAS
+	CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.so $(CUBLASLD_FLAGS) $(LDFLAGS)
+endif
+ifdef LLAMA_HIPBLAS
+	HIPBLAS_BUILD = $(HCXX) $(CXXFLAGS) $(HIPFLAGS) $^ -shared -o $@.so $(HIPLDFLAGS) $(LDFLAGS)
+endif
+ifdef LLAMA_VULKAN
+	VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ -lvulkan -shared -o $@.so $(LDFLAGS)
+endif
 endif
 
 ifndef LLAMA_CLBLAST
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -1744,7 +1744,7 @@ uint8_t* scale_down_image(uint8_t* input_image, int& nx, int& ny, int nc, int ma
     return resized_image;
 }
 
-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img, const int maxdims) {
     int nx, ny, nc;
     auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
@@ -1753,7 +1753,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     }
 
     float maxaspect = 4.0f;
-    int maxdims = 2048;
 
     //check if image needs downscaling
     if (nx > maxdims || ny > maxdims) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
@@ -76,7 +76,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
-CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img, const int maxdims);
 
 /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
@@ -473,7 +473,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
 
 struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
     clip_image_u8 * img = clip_image_u8_init();
-    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img, 2048)) {
         clip_image_u8_free(img);
         LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
diff --git a/expose.h b/expose.h
@@ -45,6 +45,7 @@ struct load_model_inputs
     const int draft_gpulayers = 999;
     const float draft_gpusplit[tensor_split_max] = {};
     const char * mmproj_filename = nullptr;
+    const int visionmaxres = 2048;
     const bool use_mmap = false;
     const bool use_mlock = false;
     const bool use_smartcontext = false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -103,6 +103,7 @@ static clip_image_u8 * clp_img_data = nullptr; //most recent image
 static std::vector<llava_image> llava_images;
 static std::string llava_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
 static int current_llava_identifier = LLAVA_TOKEN_IDENTIFIER_A;
+static int vision_max_res = 2048;
 
 static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
@@ -1951,6 +1952,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
     = mpt_ctx_v3.hparams.n_ctx = kcpp_data->n_ctx;
 
+    vision_max_res = inputs.visionmaxres;
+
     //determine rope scaling params
     float rope_freq_scale = 1.0f;
     float rope_freq_base = 10000.0f;
@@ -2198,7 +2201,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)
         {
-            printf("Qwen2VL detected! Mrope will be used!\n");
+            printf("Qwen2VL detected! Mrope will be used, and context shift will be disabled!\n");
+            kcpp_data->use_contextshift = false;
         }
         model_params.main_gpu = cu_parseinfo_maindevice;
 
@@ -3432,7 +3436,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         {
             std::string llava_image = llava_images[i].b64data;
             const std::vector<uint8_t> image_buffer = kcpp_base64_decode(llava_image);
-            if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), clp_img_data))
+            if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), clp_img_data, vision_max_res))
             {
                 //failed to load image
                 printf("\nError: Clip image %d failed to load!",i);
diff --git a/koboldcpp.py b/koboldcpp.py

Original file line number	Diff line number	Diff line change
`@@ -1744,7 +1744,7 @@ uint8_t* scale_down_image(uint8_t* input_image, int& nx, int& ny, int nc, int ma`
`1744`	`1744`	`return resized_image;`
`1745`	`1745`	`}`
`1746`	`1746`
`1747`		`-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {`
	`1747`	`+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img, const int maxdims) {`
`1748`	`1748`	`int nx, ny, nc;`
`1749`	`1749`	`auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);`
`1750`	`1750`	`if (!data) {`
`@@ -1753,7 +1753,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length`
`1753`	`1753`	`}`
`1754`	`1754`
`1755`	`1755`	`float maxaspect = 4.0f;`
`1756`		`- int maxdims = 2048;`
`1757`	`1756`
`1758`	`1757`	`//check if image needs downscaling`
`1759`	`1758`	`if (nx > maxdims \|\| ny > maxdims) {`