diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
index 864d2f6a07..5b90f140c2 100644
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@@ -25,7 +25,7 @@ hf-hub = { workspace = true, features = ["tokio"] }
 image = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 num-traits = { workspace = true }
-pyo3 = { version = "0.20.0", features = ["auto-initialize"], optional = true }
+pyo3 = { version = "0.21.0", features = ["auto-initialize"], optional = true }
 rayon = { workspace = true }
 rubato = { version = "0.15.0", optional = true }
 safetensors = { workspace = true }
diff --git a/candle-pyo3/Cargo.toml b/candle-pyo3/Cargo.toml
index 7c6fbd689f..8800133429 100644
--- a/candle-pyo3/Cargo.toml
+++ b/candle-pyo3/Cargo.toml
@@ -20,10 +20,10 @@ candle-nn = { workspace = true }
 candle-onnx = { workspace = true, optional = true }
 half = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
-pyo3 = { version = "0.20.0", features = ["extension-module", "abi3-py38"] }
+pyo3 = { version = "0.21.0", features = ["extension-module", "abi3-py38"] }
 
 [build-dependencies]
-pyo3-build-config = "0.20"
+pyo3-build-config = "0.21"
 
 [features]
 default = []
diff --git a/candle-pyo3/py_src/candle/__init__.pyi b/candle-pyo3/py_src/candle/__init__.pyi
index b0f05de591..e418e15d77 100644
--- a/candle-pyo3/py_src/candle/__init__.pyi
+++ b/candle-pyo3/py_src/candle/__init__.pyi
@@ -26,21 +26,21 @@ class i64(DType):
     pass
 
 @staticmethod
-def ones(*shape: Shape, dtype: Optional[DType] = None, device: Optional[Device] = None) -> Tensor:
+def ones(shape: Sequence[int], dtype: Optional[DType] = None, device: Optional[Device] = None) -> Tensor:
     """
     Creates a new tensor filled with ones.
     """
     pass
 
 @staticmethod
-def rand(*shape: Shape, device: Optional[Device] = None) -> Tensor:
+def rand(shape: Sequence[int], device: Optional[Device] = None) -> Tensor:
     """
     Creates a new tensor with random values.
     """
     pass
 
 @staticmethod
-def randn(*shape: Shape, device: Optional[Device] = None) -> Tensor:
+def randn(shape: Sequence[int], device: Optional[Device] = None) -> Tensor:
     """
     Creates a new tensor with random values from a normal distribution.
     """
@@ -67,7 +67,7 @@ class u8(DType):
     pass
 
 @staticmethod
-def zeros(*shape: Shape, dtype: Optional[DType] = None, device: Optional[Device] = None) -> Tensor:
+def zeros(shape: Sequence[int], dtype: Optional[DType] = None, device: Optional[Device] = None) -> Tensor:
     """
     Creates a new tensor filled with zeros.
     """
@@ -208,12 +208,6 @@ class Tensor:
         """
         pass
 
-    def abs(self) -> Tensor:
-        """
-        Performs the `abs` operation on the tensor.
-        """
-        pass
-
     def argmax_keepdim(self, dim: int) -> Tensor:
         """
         Returns the indices of the maximum value(s) across the selected dimension.
@@ -232,7 +226,7 @@ class Tensor:
         """
         pass
 
-    def broadcast_as(self, *shape: Shape) -> Tensor:
+    def broadcast_as(self, shape: Sequence[int]) -> Tensor:
         """
         Broadcasts the tensor to the given shape.
         """
@@ -244,7 +238,7 @@ class Tensor:
         """
         pass
 
-    def broadcast_left(self, *shape: Shape) -> Tensor:
+    def broadcast_left(self, shape: Sequence[int]) -> Tensor:
         """
         Broadcasts the tensor to the given shape, adding new dimensions on the left.
         """
@@ -324,12 +318,6 @@ class Tensor:
         """
         pass
 
-    def gather(self, index, dim):
-        """
-        Gathers values along an axis specified by dim.
-        """
-        pass
-
     def get(self, index: int) -> Tensor:
         """
         Gets the value at the specified index.
@@ -397,13 +385,6 @@ class Tensor:
         """
         pass
 
-    @property
-    def nelement(self) -> int:
-        """
-        Gets the tensor's element count.
-        """
-        pass
-
     def powf(self, p: float) -> Tensor:
         """
         Performs the `pow` operation on the tensor with the given exponent.
@@ -429,7 +410,7 @@ class Tensor:
         """
         pass
 
-    def reshape(self, *shape: Shape) -> Tensor:
+    def reshape(self, shape: Sequence[int]) -> Tensor:
         """
         Reshapes the tensor to the given shape.
         """
@@ -491,12 +472,6 @@ class Tensor:
         """
         pass
 
-    def to(self, *args, **kwargs) -> Tensor:
-        """
-        Performs Tensor dtype and/or device conversion.
-        """
-        pass
-
     def to_device(self, device: Union[str, Device]) -> Tensor:
         """
         Move the tensor to a new device.
@@ -509,12 +484,6 @@ class Tensor:
         """
         pass
 
-    def to_torch(self) -> torch.Tensor:
-        """
-        Converts candle's tensor to pytorch's tensor
-        """
-        pass
-
     def transpose(self, dim1: int, dim2: int) -> Tensor:
         """
         Returns a tensor that is a transposed version of the input, the given dimensions are swapped.
diff --git a/candle-pyo3/py_src/candle/nn/__init__.pyi b/candle-pyo3/py_src/candle/nn/__init__.pyi
new file mode 100644
index 0000000000..118c4cff30
--- /dev/null
+++ b/candle-pyo3/py_src/candle/nn/__init__.pyi
@@ -0,0 +1,19 @@
+# Generated content DO NOT EDIT
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Sequence
+from os import PathLike
+from candle.typing import _ArrayLike, Device, Scalar, Index, Shape
+from candle import Tensor, DType, QTensor
+
+@staticmethod
+def silu(tensor: Tensor) -> Tensor:
+    """
+    Applies the Sigmoid Linear Unit (SiLU) function to a given tensor.
+    """
+    pass
+
+@staticmethod
+def softmax(tensor: Tensor, dim: int) -> Tensor:
+    """
+    Applies the Softmax function to a given tensor.
+    """
+    pass
diff --git a/candle-pyo3/py_src/candle/utils/__init__.pyi b/candle-pyo3/py_src/candle/utils/__init__.pyi
index c9a9f9f3c1..4ee51c290b 100644
--- a/candle-pyo3/py_src/candle/utils/__init__.pyi
+++ b/candle-pyo3/py_src/candle/utils/__init__.pyi
@@ -33,9 +33,7 @@ def has_mkl() -> bool:
     pass
 
 @staticmethod
-def load_ggml(
-    path: Union[str, PathLike], device: Optional[Device] = None
-) -> Tuple[Dict[str, QTensor], Dict[str, Any], List[str]]:
+def load_ggml(path: Union[str, PathLike]) -> Tuple[Dict[str, QTensor], Dict[str, Any], List[str]]:
     """
     Load a GGML file. Returns a tuple of three objects: a dictionary mapping tensor names to tensors,
     a dictionary mapping hyperparameter names to hyperparameter values, and a vocabulary.
@@ -43,9 +41,7 @@ def load_ggml(
     pass
 
 @staticmethod
-def load_gguf(
-    path: Union[str, PathLike], device: Optional[Device] = None
-) -> Tuple[Dict[str, QTensor], Dict[str, Any]]:
+def load_gguf(path: Union[str, PathLike]) -> Tuple[Dict[str, QTensor], Dict[str, Any]]:
     """
     Loads a GGUF file. Returns a tuple of two dictionaries: the first maps tensor names to tensors,
     and the second maps metadata keys to metadata values.
diff --git a/candle-pyo3/src/lib.rs b/candle-pyo3/src/lib.rs
index e0d3bf300f..06d682cf68 100644
--- a/candle-pyo3/src/lib.rs
+++ b/candle-pyo3/src/lib.rs
@@ -60,8 +60,8 @@ impl PyDType {
 impl PyDType {
     fn from_pyobject(ob: PyObject, py: Python<'_>) -> PyResult<Self> {
         use std::str::FromStr;
-        if let Ok(dtype) = ob.extract::<&str>(py) {
-            let dtype = DType::from_str(dtype)
+        if let Ok(dtype) = ob.extract::<String>(py) {
+            let dtype = DType::from_str(&dtype)
                 .map_err(|_| PyTypeError::new_err(format!("invalid dtype '{dtype}'")))?;
             Ok(Self(dtype))
         } else {
@@ -116,8 +116,8 @@ impl PyDevice {
 
 impl<'source> FromPyObject<'source> for PyDevice {
     fn extract(ob: &'source PyAny) -> PyResult<Self> {
-        let device: &str = ob.extract()?;
-        let device = match device {
+        let device: String = ob.extract()?;
+        let device = match device.as_str() {
             "cpu" => PyDevice::Cpu,
             "cuda" => PyDevice::Cuda,
             _ => Err(PyTypeError::new_err(format!("invalid device '{device}'")))?,
@@ -265,7 +265,7 @@ impl PyTensor {
         } else if let Ok(TorchTensor(numpy)) = data.extract::<TorchTensor>(py) {
             return PyTensor::new(py, numpy);
         } else {
-            let ty = data.as_ref(py).get_type();
+            let ty = data.bind(py).get_type();
             Err(PyTypeError::new_err(format!(
                 "incorrect type {ty} for tensor"
             )))?
@@ -322,7 +322,7 @@ impl PyTensor {
     fn to_torch(&self, py: Python<'_>) -> PyResult<PyObject> {
         let candle_values = self.values(py)?;
         let torch_tensor: PyObject = py
-            .import("torch")?
+            .import_bound("torch")?
             .getattr("tensor")?
             .call1((candle_values,))?
             .extract()?;
@@ -333,7 +333,7 @@ impl PyTensor {
     /// Gets the tensor's shape.
     /// &RETURNS&: Tuple[int]
     fn shape(&self, py: Python<'_>) -> PyObject {
-        PyTuple::new(py, self.0.dims()).to_object(py)
+        PyTuple::new_bound(py, self.0.dims()).to_object(py)
     }
 
     #[getter]
@@ -347,7 +347,7 @@ impl PyTensor {
     /// Gets the tensor's strides.
     /// &RETURNS&: Tuple[int]
     fn stride(&self, py: Python<'_>) -> PyObject {
-        PyTuple::new(py, self.0.stride()).to_object(py)
+        PyTuple::new_bound(py, self.0.stride()).to_object(py)
     }
 
     #[getter]
@@ -527,7 +527,7 @@ impl PyTensor {
         }
 
         fn extract_indexer(
-            py_indexer: &PyAny,
+            py_indexer: &Bound<PyAny>,
             current_dim: usize,
             dims: &[usize],
             index_argument_count: usize,
@@ -567,7 +567,7 @@ impl PyTensor {
                     ),
                     current_dim + 1,
                 ))
-            } else if py_indexer.is_ellipsis() {
+            } else if py_indexer.is(&py_indexer.py().Ellipsis()) {
                 // Handle '...' e.g. tensor[..., 0]
                 if current_dim > 0 {
                     return Err(PyTypeError::new_err(
@@ -586,7 +586,7 @@ impl PyTensor {
             }
         }
 
-        if let Ok(tuple) = idx.downcast::<pyo3::types::PyTuple>(py) {
+        if let Ok(tuple) = idx.downcast_bound::<pyo3::types::PyTuple>(py) {
             let not_none_count: usize = tuple.iter().filter(|x| !x.is_none()).count();
 
             if not_none_count > dims.len() {
@@ -596,12 +596,12 @@ impl PyTensor {
             let mut current_dim = 0;
             for item in tuple.iter() {
                 let (indexer, new_current_dim) =
-                    extract_indexer(item, current_dim, dims, not_none_count)?;
+                    extract_indexer(&item, current_dim, dims, not_none_count)?;
                 current_dim = new_current_dim;
                 indexers.push(indexer);
             }
         } else {
-            let (indexer, _) = extract_indexer(idx.downcast::<PyAny>(py)?, 0, dims, 1)?;
+            let (indexer, _) = extract_indexer(idx.downcast_bound::<PyAny>(py)?, 0, dims, 1)?;
             indexers.push(indexer);
         }
 
@@ -652,7 +652,7 @@ impl PyTensor {
 
     /// Add two tensors.
     /// &RETURNS&: Tensor
-    fn __add__(&self, rhs: &PyAny) -> PyResult<Self> {
+    fn __add__(&self, rhs: &Bound<PyAny>) -> PyResult<Self> {
         let tensor = if let Ok(rhs) = rhs.extract::<Self>() {
             self.0.broadcast_add(&rhs.0).map_err(wrap_err)?
         } else if let Ok(rhs) = rhs.extract::<f64>() {
@@ -663,13 +663,13 @@ impl PyTensor {
         Ok(Self(tensor))
     }
 
-    fn __radd__(&self, rhs: &PyAny) -> PyResult<Self> {
+    fn __radd__(&self, rhs: &Bound<PyAny>) -> PyResult<Self> {
         self.__add__(rhs)
     }
 
     /// Multiply two tensors.
     /// &RETURNS&: Tensor
-    fn __mul__(&self, rhs: &PyAny) -> PyResult<Self> {
+    fn __mul__(&self, rhs: &Bound<PyAny>) -> PyResult<Self> {
         let tensor = if let Ok(rhs) = rhs.extract::<Self>() {
             self.0.broadcast_mul(&rhs.0).map_err(wrap_err)?
         } else if let Ok(rhs) = rhs.extract::<f64>() {
@@ -680,13 +680,13 @@ impl PyTensor {
         Ok(Self(tensor))
     }
 
-    fn __rmul__(&self, rhs: &PyAny) -> PyResult<Self> {
+    fn __rmul__(&self, rhs: &Bound<PyAny>) -> PyResult<Self> {
         self.__mul__(rhs)
     }
 
     /// Subtract two tensors.
     /// &RETURNS&: Tensor
-    fn __sub__(&self, rhs: &PyAny) -> PyResult<Self> {
+    fn __sub__(&self, rhs: &Bound<PyAny>) -> PyResult<Self> {
         let tensor = if let Ok(rhs) = rhs.extract::<Self>() {
             self.0.broadcast_sub(&rhs.0).map_err(wrap_err)?
         } else if let Ok(rhs) = rhs.extract::<f64>() {
@@ -699,7 +699,7 @@ impl PyTensor {
 
     /// Divide two tensors.
     /// &RETURNS&: Tensor
-    fn __truediv__(&self, rhs: &PyAny) -> PyResult<Self> {
+    fn __truediv__(&self, rhs: &Bound<PyAny>) -> PyResult<Self> {
         let tensor = if let Ok(rhs) = rhs.extract::<Self>() {
             self.0.broadcast_div(&rhs.0).map_err(wrap_err)?
         } else if let Ok(rhs) = rhs.extract::<f64>() {
@@ -711,7 +711,7 @@ impl PyTensor {
     }
     /// Rich-compare two tensors.
     /// &RETURNS&: Tensor
-    fn __richcmp__(&self, rhs: &PyAny, op: CompareOp) -> PyResult<Self> {
+    fn __richcmp__(&self, rhs: &Bound<PyAny>, op: CompareOp) -> PyResult<Self> {
         let compare = |lhs: &Tensor, rhs: &Tensor| {
             let t = match op {
                 CompareOp::Eq => lhs.eq(rhs),
@@ -957,7 +957,7 @@ impl PyTensor {
     #[pyo3(signature = (*args, **kwargs), text_signature = "(self, *args, **kwargs)")]
     /// Performs Tensor dtype and/or device conversion.
     /// &RETURNS&: Tensor
-    fn to(&self, args: &PyTuple, kwargs: Option<&PyDict>) -> PyResult<Self> {
+    fn to(&self, args: &Bound<PyTuple>, kwargs: Option<&Bound<PyDict>>) -> PyResult<Self> {
         let mut device: Option<PyDevice> = None;
         let mut dtype: Option<PyDType> = None;
         let mut other: Option<PyTensor> = None;
@@ -1227,7 +1227,7 @@ impl PyQTensor {
     ///Gets the shape of the tensor.
     /// &RETURNS&: Tuple[int]
     fn shape(&self, py: Python<'_>) -> PyObject {
-        PyTuple::new(py, self.0.shape().dims()).to_object(py)
+        PyTuple::new_bound(py, self.0.shape().dims()).to_object(py)
     }
 
     fn __repr__(&self) -> String {
@@ -1265,7 +1265,7 @@ fn load_safetensors(path: &str, py: Python<'_>) -> PyResult<PyObject> {
         .into_iter()
         .map(|(key, value)| (key, PyTensor(value).into_py(py)))
         .collect::<Vec<_>>();
-    Ok(res.into_py_dict(py).to_object(py))
+    Ok(res.into_py_dict_bound(py).to_object(py))
 }
 
 #[pyfunction]
@@ -1303,7 +1303,7 @@ fn load_ggml(
         .map(|(key, qtensor)| Ok((key, PyQTensor(Arc::new(qtensor)).into_py(py))))
         .collect::<::candle::Result<Vec<_>>>()
         .map_err(wrap_err)?;
-    let tensors = tensors.into_py_dict(py).to_object(py);
+    let tensors = tensors.into_py_dict_bound(py).to_object(py);
     let hparams = [
         ("n_vocab", ggml.hparams.n_vocab),
         ("n_embd", ggml.hparams.n_embd),
@@ -1313,7 +1313,7 @@ fn load_ggml(
         ("n_rot", ggml.hparams.n_rot),
         ("ftype", ggml.hparams.ftype),
     ];
-    let hparams = hparams.into_py_dict(py).to_object(py);
+    let hparams = hparams.into_py_dict_bound(py).to_object(py);
     let vocab = ggml
         .vocab
         .token_score_pairs
@@ -1351,7 +1351,7 @@ fn load_gguf(
             gguf_file::Value::Bool(x) => x.into_py(py),
             gguf_file::Value::String(x) => x.into_py(py),
             gguf_file::Value::Array(x) => {
-                let list = pyo3::types::PyList::empty(py);
+                let list = pyo3::types::PyList::empty_bound(py);
                 for elem in x.iter() {
                     list.append(gguf_value_to_pyobject(elem, py)?)?;
                 }
@@ -1371,13 +1371,13 @@ fn load_gguf(
         })
         .collect::<::candle::Result<Vec<_>>>()
         .map_err(wrap_err)?;
-    let tensors = tensors.into_py_dict(py).to_object(py);
+    let tensors = tensors.into_py_dict_bound(py).to_object(py);
     let metadata = gguf
         .metadata
         .iter()
         .map(|(key, value)| Ok((key, gguf_value_to_pyobject(value, py)?)))
         .collect::<PyResult<Vec<_>>>()?
-        .into_py_dict(py)
+        .into_py_dict_bound(py)
         .to_object(py);
     Ok((tensors, metadata))
 }
@@ -1390,7 +1390,7 @@ fn load_gguf(
 fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) -> PyResult<()> {
     use ::candle::quantized::gguf_file;
 
-    fn pyobject_to_gguf_value(v: &PyAny, py: Python<'_>) -> PyResult<gguf_file::Value> {
+    fn pyobject_to_gguf_value(v: &Bound<PyAny>, py: Python<'_>) -> PyResult<gguf_file::Value> {
         let v: gguf_file::Value = if let Ok(x) = v.extract::<u8>() {
             gguf_file::Value::U8(x)
         } else if let Ok(x) = v.extract::<i8>() {
@@ -1418,7 +1418,7 @@ fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>)
         } else if let Ok(x) = v.extract::<Vec<PyObject>>() {
             let x = x
                 .into_iter()
-                .map(|f| pyobject_to_gguf_value(f.as_ref(py), py))
+                .map(|f| pyobject_to_gguf_value(f.bind(py), py))
                 .collect::<PyResult<Vec<_>>>()?;
             gguf_file::Value::Array(x)
         } else {
@@ -1450,7 +1450,7 @@ fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>)
             Ok((
                 key.extract::<String>()
                     .map_err(|_| PyErr::new::<PyValueError, _>("keys must be strings"))?,
-                pyobject_to_gguf_value(value, py)?,
+                pyobject_to_gguf_value(&value.as_borrowed(), py)?,
             ))
         })
         .collect::<PyResult<Vec<_>>>()?;
@@ -1498,7 +1498,7 @@ fn get_num_threads() -> usize {
     ::candle::utils::get_num_threads()
 }
 
-fn candle_utils(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
+fn candle_utils(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(cuda_is_available, m)?)?;
     m.add_function(wrap_pyfunction!(get_num_threads, m)?)?;
     m.add_function(wrap_pyfunction!(has_accelerate, m)?)?;
@@ -1579,7 +1579,7 @@ fn tanh(tensor: PyTensor) -> PyResult<PyTensor> {
     Ok(PyTensor(s))
 }
 
-fn candle_functional_m(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
+fn candle_functional_m(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(silu, m)?)?;
     m.add_function(wrap_pyfunction!(softmax, m)?)?;
     m.add_function(wrap_pyfunction!(max_pool2d, m)?)?;
@@ -1599,13 +1599,13 @@ fn candle_onnx_m(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
 }
 
 #[pymodule]
-fn candle(py: Python<'_>, m: &PyModule) -> PyResult<()> {
-    let utils = PyModule::new(py, "utils")?;
-    candle_utils(py, utils)?;
-    m.add_submodule(utils)?;
-    let nn = PyModule::new(py, "functional")?;
-    candle_functional_m(py, nn)?;
-    m.add_submodule(nn)?;
+fn candle(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
+    let utils = PyModule::new_bound(py, "utils")?;
+    candle_utils(py, &utils)?;
+    m.add_submodule(&utils)?;
+    let nn = PyModule::new_bound(py, "functional")?;
+    candle_functional_m(py, &nn)?;
+    m.add_submodule(&nn)?;
     #[cfg(feature = "onnx")]
     {
         let onnx = PyModule::new(py, "onnx")?;