From 7f90ff7aecd973a17c77a7248b9112401eac4c97 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 6 Feb 2025 18:36:23 +0100 Subject: [PATCH 1/9] chore(llama-ggml): drop deprecated backend (#4775) The GGML format is now dead, since in the next version of LocalAI we already bring many breaking compatibility changes, taking the occasion also to drop ggml support (pre-gguf). Signed-off-by: Ettore Di Giacinto --- Makefile | 38 +--- backend/go/llm/llama-ggml/llama.go | 204 ------------------ backend/go/llm/llama-ggml/main.go | 19 -- core/http/app_test.go | 71 ------ docs/content/docs/features/text-generation.md | 17 +- pkg/model/initializers.go | 6 +- 6 files changed, 7 insertions(+), 348 deletions(-) delete mode 100644 backend/go/llm/llama-ggml/llama.go delete mode 100644 backend/go/llm/llama-ggml/main.go diff --git a/Makefile b/Makefile index 7edb6f6a0464..790c6e6dca1b 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,6 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp -GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b # whisper.cpp version @@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas) LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib export CXX=$(ROCM_HOME)/llvm/bin/clang++ export CC=$(ROCM_HOME)/llvm/bin/clang - # llama-ggml has no hipblas support, so override it here. export STABLE_BUILD_TYPE= export GGML_HIP=1 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101 @@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback -ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper @@ -222,19 +218,6 @@ endif all: help -## go-llama.cpp -sources/go-llama.cpp: - mkdir -p sources/go-llama.cpp - cd sources/go-llama.cpp && \ - git init && \ - git remote add origin $(GOLLAMA_REPO) && \ - git fetch origin && \ - git checkout $(GOLLAMA_VERSION) && \ - git submodule update --init --recursive --depth 1 --single-branch - -sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp - $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a - ## bark.cpp sources/bark.cpp: git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \ @@ -310,19 +293,17 @@ sources/whisper.cpp: sources/whisper.cpp/libwhisper.a: sources/whisper.cpp cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a -get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp +get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp replace: $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper - $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp dropreplace: $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp $(GOCMD) mod edit -dropreplace 
github.com/ggerganov/whisper.cpp/bindings/go $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper - $(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp prepare-sources: get-sources replace $(GOCMD) mod download @@ -330,7 +311,6 @@ prepare-sources: get-sources replace ## GENERIC rebuild: ## Rebuilds the project $(GOCMD) clean -cache - $(MAKE) -C sources/go-llama.cpp clean $(MAKE) -C sources/whisper.cpp clean $(MAKE) -C sources/go-piper clean $(MAKE) build @@ -434,7 +414,7 @@ run: prepare ## run local-ai test-models/testmodel.ggml: mkdir test-models mkdir test-dir - wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml + wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav @@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs export GO_TAGS="tts debug" $(MAKE) prepare-test HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ - $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS) - $(MAKE) test-llama + $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS) $(MAKE) test-llama-gguf $(MAKE) test-tts $(MAKE) test-stablediffusion @@ -479,10 +458,6 @@ teardown-e2e: rm -rf $(TEST_DIR) || true docker stop $$(docker ps -q --filter ancestor=localai-tests) -test-llama: prepare-test - TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ - $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) - test-llama-gguf: prepare-test TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) @@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc mkdir -p backend-assets/util/ cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server -backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc - CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ - $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/ -ifneq ($(UPX),) - $(UPX) backend-assets/grpc/llama-ggml -endif - backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags 
"$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/ diff --git a/backend/go/llm/llama-ggml/llama.go b/backend/go/llm/llama-ggml/llama.go deleted file mode 100644 index 1a7add692a3d..000000000000 --- a/backend/go/llm/llama-ggml/llama.go +++ /dev/null @@ -1,204 +0,0 @@ -package main - -// This is a wrapper to statisfy the GRPC service interface -// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc) -import ( - "fmt" - - "github.com/go-skynet/go-llama.cpp" - "github.com/mudler/LocalAI/pkg/grpc/base" - pb "github.com/mudler/LocalAI/pkg/grpc/proto" -) - -type LLM struct { - base.SingleThread - - llama *llama.LLama -} - -func (llm *LLM) Load(opts *pb.ModelOptions) error { - ropeFreqBase := float32(10000) - ropeFreqScale := float32(1) - - if opts.RopeFreqBase != 0 { - ropeFreqBase = opts.RopeFreqBase - } - if opts.RopeFreqScale != 0 { - ropeFreqScale = opts.RopeFreqScale - } - - llamaOpts := []llama.ModelOption{ - llama.WithRopeFreqBase(ropeFreqBase), - llama.WithRopeFreqScale(ropeFreqScale), - } - - if opts.NGQA != 0 { - llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA))) - } - - if opts.RMSNormEps != 0 { - llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps)) - } - - if opts.ContextSize != 0 { - llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize))) - } - if opts.F16Memory { - llamaOpts = append(llamaOpts, llama.EnableF16Memory) - } - if opts.Embeddings { - llamaOpts = append(llamaOpts, llama.EnableEmbeddings) - } - if opts.NGPULayers != 0 { - llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers))) - } - - llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap)) - llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU)) - llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit)) - if opts.NBatch != 0 { - llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch))) - } else { - llamaOpts = append(llamaOpts, llama.SetNBatch(512)) - } - - if opts.NUMA { - llamaOpts = append(llamaOpts, llama.EnableNUMA) - } - - if opts.LowVRAM { - llamaOpts = append(llamaOpts, llama.EnabelLowVRAM) - } - - model, err := llama.New(opts.ModelFile, llamaOpts...) 
- llm.llama = model - - return err -} - -func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption { - ropeFreqBase := float32(10000) - ropeFreqScale := float32(1) - - if opts.RopeFreqBase != 0 { - ropeFreqBase = opts.RopeFreqBase - } - if opts.RopeFreqScale != 0 { - ropeFreqScale = opts.RopeFreqScale - } - predictOptions := []llama.PredictOption{ - llama.SetTemperature(opts.Temperature), - llama.SetTopP(opts.TopP), - llama.SetTopK(int(opts.TopK)), - llama.SetTokens(int(opts.Tokens)), - llama.SetThreads(int(opts.Threads)), - llama.WithGrammar(opts.Grammar), - llama.SetRopeFreqBase(ropeFreqBase), - llama.SetRopeFreqScale(ropeFreqScale), - llama.SetNegativePromptScale(opts.NegativePromptScale), - llama.SetNegativePrompt(opts.NegativePrompt), - } - - if opts.PromptCacheAll { - predictOptions = append(predictOptions, llama.EnablePromptCacheAll) - } - - if opts.PromptCacheRO { - predictOptions = append(predictOptions, llama.EnablePromptCacheRO) - } - - // Expected absolute path - if opts.PromptCachePath != "" { - predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath)) - } - - if opts.Mirostat != 0 { - predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat))) - } - - if opts.MirostatETA != 0 { - predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA)) - } - - if opts.MirostatTAU != 0 { - predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU)) - } - - if opts.Debug { - predictOptions = append(predictOptions, llama.Debug) - } - - predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...)) - - if opts.PresencePenalty != 0 { - predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty)) - } - - if opts.NKeep != 0 { - predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep))) - } - - if opts.Batch != 0 { - predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch))) - } - - if opts.F16KV { - predictOptions = append(predictOptions, llama.EnableF16KV) - } - - if opts.IgnoreEOS { - predictOptions = append(predictOptions, llama.IgnoreEOS) - } - - if opts.Seed != 0 { - predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed))) - } - - //predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed)) - - predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty)) - predictOptions = append(predictOptions, llama.SetMlock(opts.MLock)) - predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap)) - predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU)) - predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit)) - predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ)) - predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP)) - return predictOptions -} - -func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) { - return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...) -} - -func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error { - predictOptions := buildPredictOptions(opts) - - predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool { - results <- token - return true - })) - - go func() { - _, err := llm.llama.Predict(opts.Prompt, predictOptions...) 
- if err != nil { - fmt.Println("err: ", err) - } - close(results) - }() - - return nil -} - -func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) { - predictOptions := buildPredictOptions(opts) - - if len(opts.EmbeddingTokens) > 0 { - tokens := []int{} - for _, t := range opts.EmbeddingTokens { - tokens = append(tokens, int(t)) - } - return llm.llama.TokenEmbeddings(tokens, predictOptions...) - } - - return llm.llama.Embeddings(opts.Embeddings, predictOptions...) -} diff --git a/backend/go/llm/llama-ggml/main.go b/backend/go/llm/llama-ggml/main.go deleted file mode 100644 index 544771db91f1..000000000000 --- a/backend/go/llm/llama-ggml/main.go +++ /dev/null @@ -1,19 +0,0 @@ -package main - -import ( - "flag" - - grpc "github.com/mudler/LocalAI/pkg/grpc" -) - -var ( - addr = flag.String("addr", "localhost:50051", "the address to connect to") -) - -func main() { - flag.Parse() - - if err := grpc.StartServer(*addr, &LLM{}); err != nil { - panic(err) - } -} diff --git a/core/http/app_test.go b/core/http/app_test.go index ca7a2eaa55bb..ecaf6da3b0e6 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -526,77 +526,6 @@ var _ = Describe("API test", func() { Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this")) }) - It("runs openllama(llama-ggml backend)", Label("llama"), func() { - if runtime.GOOS != "linux" { - Skip("test supported only on linux") - } - response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{ - URL: "github:go-skynet/model-gallery/openllama_3b.yaml", - Name: "openllama_3b", - Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128}, - }) - - Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response)) - - uuid := response["uuid"].(string) - - Eventually(func() bool { - response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid) - return response["processed"].(bool) - }, "360s", "10s").Should(Equal(true)) - - By("testing completion") - resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(resp.Choices)).To(Equal(1)) - Expect(resp.Choices[0].Text).To(ContainSubstring("five")) - - By("testing functions") - resp2, err := client.CreateChatCompletion( - context.TODO(), - openai.ChatCompletionRequest{ - Model: "openllama_3b", - Messages: []openai.ChatCompletionMessage{ - { - Role: "user", - Content: "What is the weather like in San Francisco (celsius)?", - }, - }, - Functions: []openai.FunctionDefinition{ - openai.FunctionDefinition{ - Name: "get_current_weather", - Description: "Get the current weather", - Parameters: jsonschema.Definition{ - Type: jsonschema.Object, - Properties: map[string]jsonschema.Definition{ - "location": { - Type: jsonschema.String, - Description: "The city and state, e.g. 
San Francisco, CA",
-							},
-							"unit": {
-								Type: jsonschema.String,
-								Enum: []string{"celcius", "fahrenheit"},
-							},
-						},
-						Required: []string{"location"},
-					},
-				},
-			},
-		})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(len(resp2.Choices)).To(Equal(1))
-		Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
-		Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
-
-		var res map[string]string
-		err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
-		Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
-		Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-
-	})
-
 	It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 		if runtime.GOOS != "linux" {
 			Skip("test supported only on linux")
diff --git a/docs/content/docs/features/text-generation.md b/docs/content/docs/features/text-generation.md
index 11ab3999af9a..342b8e76284f 100644
--- a/docs/content/docs/features/text-generation.md
+++ b/docs/content/docs/features/text-generation.md
@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
 
 {{% alert note %}}
 
-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
 
 {{% /alert %}}
 
@@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
   # Relative to the models path
-  model: file.gguf.bin
-```
-
-In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
-
-For instance, to use the `llama-ggml` backend for `ggml` models:
-
-```yaml
-name: llama
-backend: llama-ggml
-parameters:
-  # Relative to the models path
-  model: file.ggml.bin
+  model: file.gguf
 ```
 
 #### Reference
 
 - [llama](https://github.com/ggerganov/llama.cpp)
-- [binding](https://github.com/go-skynet/go-llama.cpp)
 
 ### exllama/2
 
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index ace72fa3d342..5e465cf0ef8f 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
 
 const (
-	LlamaGGML = "llama-ggml"
-
 	LLamaCPP = "llama-cpp"
 
 	LLamaCPPAVX2     = "llama-cpp-avx2"
@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
 
 	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp(variants) and llama-ggml to follow.
+		// First llama.cpp(variants)
 		// We keep the fallback to prevent that if the llama.cpp variants
 		// that depends on shared libs if breaks have still a safety net.
-		LLamaCPP, LlamaGGML, LLamaCPPFallback,
+		LLamaCPP, LLamaCPPFallback,
 	}
 
 	toTheEnd := []string{

From cc1f6f913f3c271cc2e73080991163b18ea03be0 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 6 Feb 2025 19:39:59 +0100
Subject: [PATCH 2/9] fix(llama.cpp): disable mirostat as default (#2911)

Even if it can increase the quality of the output, Mirostat has shown performance drawbacks noticeable enough to confuse users about the speed of LocalAI (see also https://github.com/mudler/LocalAI/issues/2780).

This changeset disables Mirostat by default (it can still be enabled manually).

Signed-off-by: Ettore Di Giacinto
Co-authored-by: Dave
---
 core/config/backend_config.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 8ce93d9f6e6f..2b130ec813a5 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -287,7 +287,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	defaultTopP := 0.95
 	defaultTopK := 40
 	defaultTemp := 0.9
-	defaultMirostat := 2
+	// https://github.com/mudler/LocalAI/issues/2780
+	defaultMirostat := 0
 	defaultMirostatTAU := 5.0
 	defaultMirostatETA := 0.1
 	defaultTypicalP := 1.0

From 731674eee7457642a042a043398d40e6cbf3e06a Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Thu, 6 Feb 2025 23:02:00 +0100
Subject: [PATCH 3/9] chore: :arrow_up: Update ggerganov/llama.cpp to `8a59053f63fffc24e730cd3ea067760abfe4a919` (#4776)

:arrow_up: Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 790c6e6dca1b..a122403539ed 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 
 # llama.cpp versions
-CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b
+CPPLLAMA_VERSION?=8a59053f63fffc24e730cd3ea067760abfe4a919
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp

From f670e0a91c788bde1c84d96958b3843d13f8f0f3 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 7 Feb 2025 13:29:53 +0100
Subject: [PATCH 4/9] chore(model gallery): add nohobby_l3.3-prikol-70b-v0.5 (#4777)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 4e75e71f618f..5bde3e85a69b 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -553,6 +553,29 @@
       - filename: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
         sha256: d8d4874b837993546b750db3faf1c6e5d867883a6750f04f1f4986973d7c107b
         uri: huggingface://bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF/Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
+- !!merge <<: *llama33
+  name: "nohobby_l3.3-prikol-70b-v0.5"
+  icon: https://files.catbox.moe/x9t3zo.png
+  urls:
+    - https://huggingface.co/Nohobby/L3.3-Prikol-70B-v0.5
+    - https://huggingface.co/bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF
+  description: |
+    99% of mergekit addicts quit before they hit it big.
+
+    Gosh, I need to create an org for my test runs - my profile looks like a dumpster.
+
+    What was it again? Ah, the new model.
+
+    Exactly what I wanted. All I had to do was yank out the cursed official DeepSeek distill and here we are.
+ + From the brief tests it gave me some unusual takes on the character cards I'm used to. Just this makes it worth it imo. Also the writing is kinda nice. + overrides: + parameters: + model: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf + files: + - filename: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf + sha256: 36f29015f1f420f51569603445a3ea5fe72e3651c2022ef064086f5617578fe6 + uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF/Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From cc163429dc3ea027d9a6b6578757e942fcb62ce1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Feb 2025 13:31:49 +0100 Subject: [PATCH 5/9] chore(model gallery): add cognitivecomputations_dolphin3.0-r1-mistral-24b (#4778) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 5bde3e85a69b..5af8f895abf7 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -6913,6 +6913,22 @@ - filename: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf sha256: 03aa6d1fb7ab70482a2242839b8d8e1c789aa90a8be415076ddf84bef65f06c7 uri: huggingface://bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF/krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "cognitivecomputations_dolphin3.0-r1-mistral-24b" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/hdAvdwZiJaLbGmvSZ3wTT.png + urls: + - https://huggingface.co/cognitivecomputations/Dolphin3.0-R1-Mistral-24B + - https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF + description: | + Dolphin 3.0 R1 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases. 
+ overrides: + parameters: + model: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf + files: + - filename: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf + sha256: d67de1e94fb32742bd09ee8beebbeb36a4b544785a8f8413dc4d9490e04eda6c + uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf - &mudler url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models name: "LocalAI-llama3-8b-function-call-v0.2" From 230fe0098faeca88a6ab4ddcba8e70ce0794ea86 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Feb 2025 13:33:24 +0100 Subject: [PATCH 6/9] chore(model gallery): add cognitivecomputations_dolphin3.0-mistral-24b (#4779) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 5af8f895abf7..3e0c1ac6423d 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -6929,6 +6929,22 @@ - filename: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf sha256: d67de1e94fb32742bd09ee8beebbeb36a4b544785a8f8413dc4d9490e04eda6c uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "cognitivecomputations_dolphin3.0-mistral-24b" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/cNCs1TBD3FelWCJGkZ3cd.png + urls: + - https://huggingface.co/cognitivecomputations/Dolphin3.0-Mistral-24B + - https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF + description: | + Dolphin 3.0 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases. 
+ overrides: + parameters: + model: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf + files: + - filename: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf + sha256: 6f193bbf98628140194df257c7466e2c6f80a7ef70a6ebae26c53b2f2ef21994 + uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf - &mudler url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models name: "LocalAI-llama3-8b-function-call-v0.2" From 4b1b942a7f747755fe3e45bead662eeb96db3959 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 8 Feb 2025 09:04:18 +0100 Subject: [PATCH 7/9] chore(model gallery): add sicariussicariistuff_redemption_wind_24b (#4781) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 3e0c1ac6423d..4b61a0e3b2dd 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -6945,6 +6945,28 @@ - filename: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf sha256: 6f193bbf98628140194df257c7466e2c6f80a7ef70a6ebae26c53b2f2ef21994 uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "sicariussicariistuff_redemption_wind_24b" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + icon: https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B/resolve/main/Images/Redemption_Wind_24B.png + urls: + - https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B + - https://huggingface.co/bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF + description: | + This is a lightly fine-tuned version of the Mistral 24B base model, designed as an accessible and adaptable foundation for further fine-tuning and merging fodder. Key modifications include: + ChatML-ified, with no additional tokens introduced. + High quality private instruct—not generated by ChatGPT or Claude, ensuring no slop and good markdown understanding. + No refusals—since it’s a base model, refusals should be minimal to non-existent, though, in early testing, occasional warnings still appear (I assume some were baked into the pre-train). + High-quality private creative writing dataset Mainly to dilute baked-in slop further, but it can actually write some stories, not bad for loss ~8. + Small, high-quality private RP dataset This was done so further tuning for RP will be easier. The dataset was kept small and contains ZERO SLOP, some entries are of 16k token length. + Exceptional adherence to character cards This was done to make it easier for further tunes intended for roleplay. 
+ overrides: + parameters: + model: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf + files: + - filename: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf + sha256: 40025eb00d83c9e9393555962962a2dfc5251fe7bd70812835ff0bcc55ecc463 + uri: huggingface://bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF/SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf - &mudler url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models name: "LocalAI-llama3-8b-function-call-v0.2" From 7a5912908a6c8ae2791ddc6d5a733181ae02828a Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 8 Feb 2025 09:44:34 +0100 Subject: [PATCH 8/9] chore: :arrow_up: Update ggerganov/llama.cpp to `d2fe216fb2fb7ca8627618c9ea3a2e7886325780` (#4780) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a122403539ed..01d5a14ac9c5 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=8a59053f63fffc24e730cd3ea067760abfe4a919 +CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From e01acc88c984c60b5a3e60bb1e12d4e232a20f6c Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 8 Feb 2025 22:57:40 +0100 Subject: [PATCH 9/9] chore: :arrow_up: Update ggerganov/llama.cpp to `e6e658319952f7ad269dc11275b9edddc721fc6d` (#4787) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 01d5a14ac9c5..05fa4a44f548 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780 +CPPLLAMA_VERSION?=e6e658319952f7ad269dc11275b9edddc721fc6d # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
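
A note for readers of PATCH 2/9 above: with Mirostat now disabled by default, re-enabling it becomes a per-model choice. The YAML below is only a sketch of what that opt-in might look like, reusing the `gguf` test model referenced in PATCH 1/9; the `mirostat`, `mirostat_tau`, and `mirostat_eta` key names and their top-level placement are assumptions inferred from the defaults in `core/config/backend_config.go`, not something defined by this patch series, so verify them against the LocalAI model configuration reference before relying on them.

```yaml
# Sketch: per-model config that opts back into Mirostat after PATCH 2/9
# changed the global default to disabled (defaultMirostat := 0).
# Key names and placement are assumptions; check the LocalAI config reference.
name: qwen2-1.5b-instruct
backend: llama              # gguf models use the llama backend (see the docs diff above)
context_size: 4096
parameters:
  # Relative to the models path
  model: Qwen2-1.5B-Instruct.Q2_K.gguf
mirostat: 2                 # previous default per the diff; 0 is the new default (disabled)
mirostat_tau: 5.0           # matches defaultMirostatTAU
mirostat_eta: 0.1           # matches defaultMirostatETA
```

Because `SetDefaults` only fills in values that are left unset, an explicit per-model value like this should take precedence over the new default.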