Merge branch 'main' into ivarflakstad/metal-prng

huggingface · Jan 12, 2024 · e63bb86
2 parents 87efb5d + 4191518
commit e63bb86
Show file tree

Hide file tree

Showing 53 changed files with 1,035 additions and 1,051 deletions.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+  - package-ecosystem: "cargo"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    open-pull-requests-limit: 5
diff --git a/Cargo.toml b/Cargo.toml
@@ -31,9 +31,17 @@ license = "MIT OR Apache-2.0"
 accelerate-src = { version = "0.3.2" }
 anyhow = { version = "1", features = ["backtrace"] }
 byteorder = "1.4.3"
+candle = { path = "./candle-core", package = "candle-core" }
+candle-datasets = { path = "./candle-datasets" }
+candle-flash-attn = { path = "./candle-flash-attn" }
+candle-kernels = { path = "./candle-kernels" }
+candle-metal-kernels = { path = "./candle-metal-kernels" }
+candle-nn = { path = "./candle-nn" }
+candle-onnx = { path = "./candle-onnx" }
+candle-transformers = { path = "./candle-transformers" }
 clap = { version = "4.2.4", features = ["derive"] }
 criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "0.9.14", features = ["f16"] }
+cudarc = { version = "0.10.0", features = ["f16"] }
 gemm = { version = "0.16.6", features = ["wasm-simd128-enable"] }
 hf-hub = "0.3.0"
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
@@ -42,7 +50,7 @@ imageproc = { version = "0.23.0", default-features = false }
 intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] }
 libc = { version = "0.2.147" }
 log = "0.4"
-memmap2 = { version = "0.7.1", features = ["stable_deref_trait"] }
+memmap2 = { version = "0.9.3", features = ["stable_deref_trait"] }
 num_cpus = "1.15.0"
 num-traits = "0.2.15"
 parquet = { version = "45.0.0" }
@@ -55,7 +63,7 @@ serde = { version = "1.0.171", features = ["derive"] }
 serde_plain = "1.0.2"
 serde_json = "1.0.99"
 thiserror = "1"
-tokenizers = { version = "0.13.4", default-features = false }
+tokenizers = { version = "0.15.0", default-features = false }
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"

diff --git a/candle-book/Cargo.toml b/candle-book/Cargo.toml
@@ -11,11 +11,11 @@ readme = "README.md"
 
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
-candle = { path = "../candle-core", version = "0.3.3", package = "candle-core" }
-candle-datasets = { path = "../candle-datasets", version = "0.3.3" }
-candle-nn = { path = "../candle-nn", version = "0.3.3" }
-candle-transformers = { path = "../candle-transformers", version = "0.3.3" }
-candle-flash-attn = { path = "../candle-flash-attn", version = "0.3.3", optional = true }
+candle = { workspace = true }
+candle-datasets = { workspace = true }
+candle-nn = { workspace = true }
+candle-transformers = { workspace = true }
+candle-flash-attn = { workspace = true, optional = true }
 safetensors = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }

diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
@@ -12,8 +12,8 @@ readme = "README.md"
 [dependencies]
 accelerate-src = { workspace = true, optional = true }
 byteorder = { workspace = true }
-candle-kernels = { path = "../candle-kernels", version = "0.3.3", optional = true }
-candle-metal-kernels = { path = "../candle-metal-kernels", version = "0.3.3", optional = true }
+candle-kernels = { workspace = true, optional = true }
+candle-metal-kernels = { workspace = true, optional = true }
 metal = { workspace = true, optional = true}
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }

diff --git a/candle-core/benches/benchmarks/matmul.rs b/candle-core/benches/benchmarks/matmul.rs
@@ -1,26 +1,25 @@
-use crate::benchmarks::{bench_name, device, BenchDevice};
-use candle_core::{DType, Tensor};
+use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
+use candle_core::{DType, Device, Tensor};
 use criterion::{black_box, criterion_group, Criterion, Throughput};
 use std::time::Instant;
 
 fn run(a: &Tensor, b: &Tensor) {
     a.matmul(&b.t().unwrap()).unwrap();
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
+fn run_bench(c: &mut Criterion, device: &Device) {
     let b = 1;
     let m = 1;
     let n = 2048;
     let k = 2048;
 
-    let device = device().unwrap();
     let dtype = DType::F32;
-    let lhs = Tensor::zeros((b, m, k), dtype, &device).unwrap();
-    let rhs = Tensor::zeros((b, n, k), dtype, &device).unwrap();
+    let lhs = Tensor::zeros((b, m, k), dtype, device).unwrap();
+    let rhs = Tensor::zeros((b, n, k), dtype, device).unwrap();
 
     let flops = b * m * n * k;
 
-    let mut group = c.benchmark_group(bench_name("matmul"));
+    let mut group = c.benchmark_group(device.bench_name("matmul"));
     group.throughput(Throughput::Bytes(flops as u64));
     group.bench_function("iter", move |b| {
         b.iter_custom(|iters| {
@@ -35,4 +34,11 @@ fn criterion_benchmark(c: &mut Criterion) {
     group.finish();
 }
 
+fn criterion_benchmark(c: &mut Criterion) {
+    let handler = BenchDeviceHandler::new().unwrap();
+    for device in handler.devices {
+        run_bench(c, &device);
+    }
+}
+
 criterion_group!(benches, criterion_benchmark);
diff --git a/candle-core/benches/benchmarks/mod.rs b/candle-core/benches/benchmarks/mod.rs
@@ -5,6 +5,8 @@ use candle_core::{Device, Result};
 
 pub(crate) trait BenchDevice {
     fn sync(&self) -> Result<()>;
+
+    fn bench_name<S: Into<String>>(&self, name: S) -> String;
 }
 
 impl BenchDevice for Device {
@@ -25,32 +27,38 @@ impl BenchDevice for Device {
             }
         }
     }
-}
 
-pub(crate) fn device() -> Result<Device> {
-    if cfg!(feature = "metal") {
-        Device::new_metal(0)
-    } else if cfg!(feature = "cuda") {
-        Device::new_cuda(0)
-    } else {
-        Ok(Device::Cpu)
+    fn bench_name<S: Into<String>>(&self, name: S) -> String {
+        match self {
+            Device::Cpu => {
+                let cpu_type = if cfg!(feature = "accelerate") {
+                    "accelerate"
+                } else if cfg!(feature = "mkl") {
+                    "mkl"
+                } else {
+                    "cpu"
+                };
+                format!("{}_{}", cpu_type, name.into())
+            }
+            Device::Cuda(_) => format!("cuda_{}", name.into()),
+            Device::Metal(_) => format!("metal_{}", name.into()),
+        }
     }
 }
 
-pub(crate) fn bench_name<S: Into<String>>(name: S) -> String {
-    format!("{}_{}", device_variant(), name.into())
+struct BenchDeviceHandler {
+    devices: Vec<Device>,
 }
 
-const fn device_variant() -> &'static str {
-    if cfg!(feature = "metal") {
-        "metal"
-    } else if cfg!(feature = "cuda") {
-        "cuda"
-    } else if cfg!(feature = "accelerate") {
-        "accelerate"
-    } else if cfg!(feature = "mkl") {
-        "mkl"
-    } else {
-        "cpu"
+impl BenchDeviceHandler {
+    pub fn new() -> Result<Self> {
+        let mut devices = Vec::new();
+        if cfg!(feature = "metal") {
+            devices.push(Device::new_metal(0)?);
+        } else if cfg!(feature = "cuda") {
+            devices.push(Device::new_cuda(0)?);
+        }
+        devices.push(Device::Cpu);
+        Ok(Self { devices })
     }
 }
diff --git a/candle-core/benches/benchmarks/random.rs b/candle-core/benches/benchmarks/random.rs
@@ -1,4 +1,4 @@
-use crate::benchmarks::{bench_name, device, BenchDevice};
+use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
 use candle_core::{DType, Device, Tensor};
 use criterion::{black_box, criterion_group, Criterion, Throughput};
 use std::time::Instant;
@@ -11,48 +11,53 @@ fn rand_normal(a: &Tensor) {
     a.randn_like(100.0, 15.0).unwrap();
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
+fn run_random_bench(c: &mut Criterion, device: &Device) {
     let b = 1;
 
     let rows = 2048;
     let cols = 2048;
 
-    let d = device().unwrap();
     let dtype = DType::F32;
-    let tensor = Tensor::zeros((b, rows, cols), dtype, &d).unwrap();
+    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
 
     let flops = b * rows * cols * dtype.size_in_bytes();
 
-    let mut group = c.benchmark_group(bench_name("random_uniform"));
+    let mut group = c.benchmark_group(device.bench_name("random_uniform"));
     group.throughput(Throughput::Bytes(flops as u64));
     group.bench_function("iter", move |benches| {
         benches.iter_custom(|iters| {
             let start = Instant::now();
             for _i in 0..iters {
                 rand_uniform(black_box(&tensor));
             }
-            d.sync().unwrap();
+            device.sync().unwrap();
             start.elapsed()
         })
     });
     group.finish();
 
-    let d = device().unwrap();
-    let tensor = Tensor::zeros((b, rows, cols), dtype, &d).unwrap();
+    let tensor = Tensor::zeros((b, rows, cols), dtype, device).unwrap();
 
-    let mut group = c.benchmark_group(bench_name("random_normal"));
+    let mut group = c.benchmark_group(device.bench_name("random_normal"));
     group.throughput(Throughput::Bytes(flops as u64));
     group.bench_function("iter", move |benches| {
         benches.iter_custom(|iters| {
             let start = Instant::now();
             for _i in 0..iters {
                 rand_normal(black_box(&tensor));
             }
-            d.sync().unwrap();
+            device.sync().unwrap();
             start.elapsed()
         })
     });
     group.finish();
 }
 
+fn criterion_benchmark(c: &mut Criterion) {
+    let handler = BenchDeviceHandler::new().unwrap();
+    for device in handler.devices {
+        run_random_bench(c, &device);
+    }
+}
+
 criterion_group!(benches, criterion_benchmark);
diff --git a/candle-core/examples/tensor-tools.rs b/candle-core/examples/tensor-tools.rs
@@ -102,7 +102,7 @@ enum Command {
     },
 
     Quantize {
-        /// The input file, in gguf format.
+        /// The input file(s), in safetensors format.
         in_file: Vec<std::path::PathBuf>,
 
         /// The output file, in gguf format.
@@ -117,6 +117,15 @@ enum Command {
         #[arg(long, value_enum, default_value_t = QuantizationMode::Llama)]
         mode: QuantizationMode,
     },
+
+    Dequantize {
+        /// The input file, in gguf format.
+        in_file: std::path::PathBuf,
+
+        /// The output file, in safetensors format.
+        #[arg(long)]
+        out_file: std::path::PathBuf,
+    },
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -285,6 +294,19 @@ fn run_quantize_safetensors(
     Ok(())
 }
 
+fn run_dequantize(in_file: std::path::PathBuf, out_file: std::path::PathBuf) -> Result<()> {
+    let mut in_file = std::fs::File::open(in_file)?;
+    let content = gguf_file::Content::read(&mut in_file)?;
+    let mut tensors = std::collections::HashMap::new();
+    for (tensor_name, _) in content.tensor_infos.iter() {
+        let tensor = content.tensor(&mut in_file, tensor_name)?;
+        let tensor = tensor.dequantize(&Device::Cpu)?;
+        tensors.insert(tensor_name.to_string(), tensor);
+    }
+    candle_core::safetensors::save(&tensors, out_file)?;
+    Ok(())
+}
+
 fn run_quantize(
     in_files: &[std::path::PathBuf],
     out_file: std::path::PathBuf,
@@ -379,6 +401,7 @@ fn main() -> anyhow::Result<()> {
             quantization,
             mode,
         } => run_quantize(&in_file, out_file, quantization, mode)?,
+        Command::Dequantize { in_file, out_file } => run_dequantize(in_file, out_file)?,
     }
     Ok(())
 }