diff --git a/batman-adv/Makefile b/batman-adv/Makefile index 219ca9f3a..70b9df1ee 100644 --- a/batman-adv/Makefile +++ b/batman-adv/Makefile @@ -4,7 +4,7 @@ include $(TOPDIR)/rules.mk PKG_NAME:=batman-adv PKG_VERSION:=2024.3 -PKG_RELEASE:=3 +PKG_RELEASE:=4 PKG_SOURCE:=$(PKG_NAME)-$(PKG_VERSION).tar.gz PKG_SOURCE_URL:=https://downloads.open-mesh.org/batman/releases/batman-adv-$(PKG_VERSION) diff --git a/batman-adv/patches/0006-batman-adv-fix-panic-during-interface-removal.patch b/batman-adv/patches/0006-batman-adv-fix-panic-during-interface-removal.patch new file mode 100644 index 000000000..09be28268 --- /dev/null +++ b/batman-adv/patches/0006-batman-adv-fix-panic-during-interface-removal.patch @@ -0,0 +1,71 @@ +From: Andy Strohman +Date: Thu, 9 Jan 2025 02:27:56 +0000 +Subject: batman-adv: fix panic during interface removal + +Reference counting is used to ensure that +batadv_hardif_neigh_node and batadv_hard_iface +are not freed before/during +batadv_v_elp_throughput_metric_update work is +finished. + +But there isn't a guarantee that the hard if will +remain associated with a soft interface up until +the work is finished. + +This fixes a crash triggered by reboot that looks +like this: + +Call trace: + batadv_v_mesh_free+0xd0/0x4dc [batman_adv] + batadv_v_elp_throughput_metric_update+0x1c/0xa4 + process_one_work+0x178/0x398 + worker_thread+0x2e8/0x4d0 + kthread+0xd8/0xdc + ret_from_fork+0x10/0x20 + +(the batadv_v_mesh_free call is misleading, +and does not actually happen) + +I was able to make the issue happen more reliably +by changing hardif_neigh->bat_v.metric_work work +to be delayed work. This allowed me to track down +and confirm the fix. + +Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput") +Signed-off-by: Andy Strohman +[sven@narfation.org: prevent entering batadv_v_elp_get_throughput without + soft_iface] +Signed-off-by: Sven Eckelmann +Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/1d7c8ac629678b8cd23ef9def7c7b111cb9e8ed5 + +--- a/net/batman-adv/bat_v_elp.c ++++ b/net/batman-adv/bat_v_elp.c +@@ -66,12 +66,19 @@ static void batadv_v_elp_start_timer(str + static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) + { + struct batadv_hard_iface *hard_iface = neigh->if_incoming; ++ struct net_device *soft_iface = hard_iface->soft_iface; + struct ethtool_link_ksettings link_settings; + struct net_device *real_netdev; + struct station_info sinfo; + u32 throughput; + int ret; + ++ /* don't query throughput when no longer associated with any ++ * batman-adv interface ++ */ ++ if (!soft_iface) ++ return BATADV_THROUGHPUT_DEFAULT_VALUE; ++ + /* if the user specified a customised value for this interface, then + * return it directly + */ +@@ -141,7 +148,7 @@ static u32 batadv_v_elp_get_throughput(s + + default_throughput: + if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { +- batadv_info(hard_iface->soft_iface, ++ batadv_info(soft_iface, + "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", + hard_iface->net_dev->name, + BATADV_THROUGHPUT_DEFAULT_VALUE / 10, diff --git a/batman-adv/patches/0007-batman-adv-Ignore-neighbor-throughput-metrics-in-err.patch b/batman-adv/patches/0007-batman-adv-Ignore-neighbor-throughput-metrics-in-err.patch new file mode 100644 index 000000000..2ef52c934 --- /dev/null +++ b/batman-adv/patches/0007-batman-adv-Ignore-neighbor-throughput-metrics-in-err.patch @@ -0,0 +1,127 @@ +From: Sven Eckelmann +Date: Wed, 22 Jan 2025 21:51:20 +0100 +Subject: batman-adv: Ignore neighbor throughput metrics in error case + +If a temporary error happened in the evaluation of the neighbor throughput +information, then the invalid throughput result should not be stored in the +throughtput EWMA. + +Signed-off-by: Sven Eckelmann +Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/ce8b40b7bdcc7f011191acda4c2d3067566eef54 + +--- a/net/batman-adv/bat_v_elp.c ++++ b/net/batman-adv/bat_v_elp.c +@@ -59,11 +59,13 @@ static void batadv_v_elp_start_timer(str + /** + * batadv_v_elp_get_throughput() - get the throughput towards a neighbour + * @neigh: the neighbour for which the throughput has to be obtained ++ * @pthroughput: calculated throughput towards the given neighbour in multiples ++ * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). + * +- * Return: The throughput towards the given neighbour in multiples of 100kpbs +- * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). ++ * Return: true when value behind @pthroughput was set + */ +-static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) ++static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, ++ u32 *pthroughput) + { + struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *soft_iface = hard_iface->soft_iface; +@@ -77,14 +79,16 @@ static u32 batadv_v_elp_get_throughput(s + * batman-adv interface + */ + if (!soft_iface) +- return BATADV_THROUGHPUT_DEFAULT_VALUE; ++ return false; + + /* if the user specified a customised value for this interface, then + * return it directly + */ + throughput = atomic_read(&hard_iface->bat_v.throughput_override); +- if (throughput != 0) +- return throughput; ++ if (throughput != 0) { ++ *pthroughput = throughput; ++ return true; ++ } + + /* if this is a wireless device, then ask its throughput through + * cfg80211 API +@@ -111,19 +115,24 @@ static u32 batadv_v_elp_get_throughput(s + * possible to delete this neighbor. For now set + * the throughput metric to 0. + */ +- return 0; ++ *pthroughput = 0; ++ return true; + } + if (ret) + goto default_throughput; + +- if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) +- return sinfo.expected_throughput / 100; ++ if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { ++ *pthroughput = sinfo.expected_throughput / 100; ++ return true; ++ } + + /* try to estimate the expected throughput based on reported tx + * rates + */ +- if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) +- return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; ++ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { ++ *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3; ++ return true; ++ } + + goto default_throughput; + } +@@ -142,8 +151,10 @@ static u32 batadv_v_elp_get_throughput(s + hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; + + throughput = link_settings.base.speed; +- if (throughput && throughput != SPEED_UNKNOWN) +- return throughput * 10; ++ if (throughput && throughput != SPEED_UNKNOWN) { ++ *pthroughput = throughput * 10; ++ return true; ++ } + } + + default_throughput: +@@ -157,7 +168,8 @@ default_throughput: + } + + /* if none of the above cases apply, return the base_throughput */ +- return BATADV_THROUGHPUT_DEFAULT_VALUE; ++ *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE; ++ return true; + } + + /** +@@ -169,15 +181,21 @@ void batadv_v_elp_throughput_metric_upda + { + struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; + struct batadv_hardif_neigh_node *neigh; ++ u32 throughput; ++ bool valid; + + neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, + metric_work); + neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, + bat_v); + +- ewma_throughput_add(&neigh->bat_v.throughput, +- batadv_v_elp_get_throughput(neigh)); ++ valid = batadv_v_elp_get_throughput(neigh, &throughput); ++ if (!valid) ++ goto put_neigh; ++ ++ ewma_throughput_add(&neigh->bat_v.throughput, throughput); + ++put_neigh: + /* decrement refcounter to balance increment performed before scheduling + * this task + */ diff --git a/batman-adv/patches/0008-batman-adv-Drop-unmanaged-ELP-metric-worker.patch b/batman-adv/patches/0008-batman-adv-Drop-unmanaged-ELP-metric-worker.patch new file mode 100644 index 000000000..cc7d7d041 --- /dev/null +++ b/batman-adv/patches/0008-batman-adv-Drop-unmanaged-ELP-metric-worker.patch @@ -0,0 +1,239 @@ +From: Sven Eckelmann +Date: Wed, 22 Jan 2025 21:51:21 +0100 +Subject: batman-adv: Drop unmanaged ELP metric worker + +The ELP worker needs to calculate new metric values for all neighbors +"reachable" over an interface. Some of the used metric sources require +locks which might need to sleep. This sleep is incompatible with the RCU +list iterator used for the recorded neighbors. The initial approach to work +around of this problem was to queue another work item per neighbor and then +run this in a new context. + +Even when this solved the RCU vs might_sleep() conflict, it has a major +problems: Nothing was stopping the work item in case it is not needed +anymore - for example because one of the related interfaces was removed or +the batman-adv module was unloaded - resulting in potential invalid memory +accesses. + +Directly canceling the metric worker also has various problems: + +* cancel_work_sync for a to-be-deactivated interface is called with + rtnl_lock held. But the code in the ELP metric worker also tries to use + rtnl_lock() - which will never return in this case. This also means that + cancel_work_sync would never return because it is waiting for the worker + to finish. +* iterating over the neighbor list for the to-be-deactivated interface is + currently done using the RCU specific methods. Which means that it is + possible to miss items when iterating over it without the associated + spinlock - a behaviour which is acceptable for a periodic metric check + but not for a cleanup routine (which must "stop" all still running + workers) + +The better approch is to get rid of the per interface neighbor metric +worker and handle everything in the interface worker. The original problems +are solved by: + +* creating a list of neighbors which require new metric information inside + the RCU protected context, gathering the metric according to the new list + outside the RCU protected context +* only use rcu_trylock inside metric gathering code to avoid a deadlock + when the cancel_delayed_work_sync is called in the interface removal code + (which is called with the rtnl_lock held) + +Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput") +Signed-off-by: Sven Eckelmann +Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/405d49a20a205799ec453cadb3d078dc2808d563 + +--- a/net/batman-adv/bat_v.c ++++ b/net/batman-adv/bat_v.c +@@ -113,8 +113,6 @@ static void + batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) + { + ewma_throughput_init(&hardif_neigh->bat_v.throughput); +- INIT_WORK(&hardif_neigh->bat_v.metric_work, +- batadv_v_elp_throughput_metric_update); + } + + /** +--- a/net/batman-adv/bat_v_elp.c ++++ b/net/batman-adv/bat_v_elp.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -26,6 +27,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -42,6 +44,18 @@ + #include "send.h" + + /** ++ * struct batadv_v_metric_queue_entry - list of hardif neighbors which require ++ * and metric update ++ */ ++struct batadv_v_metric_queue_entry { ++ /** @hardif_neigh: hardif neighbor scheduled for metric update */ ++ struct batadv_hardif_neigh_node *hardif_neigh; ++ ++ /** @list: list node for metric_queue */ ++ struct list_head list; ++}; ++ ++/** + * batadv_v_elp_start_timer() - restart timer for ELP periodic work + * @hard_iface: the interface for which the timer has to be reset + */ +@@ -137,10 +151,17 @@ static bool batadv_v_elp_get_throughput( + goto default_throughput; + } + ++ /* only use rtnl_trylock because the elp worker will be cancelled while ++ * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise ++ * wait forever when the elp work_item was started and it is then also ++ * trying to rtnl_lock ++ */ ++ if (!rtnl_trylock()) ++ return false; ++ + /* if not a wifi interface, check if this device provides data via + * ethtool (e.g. an Ethernet adapter) + */ +- rtnl_lock(); + ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); + rtnl_unlock(); + if (ret == 0) { +@@ -175,31 +196,19 @@ default_throughput: + /** + * batadv_v_elp_throughput_metric_update() - worker updating the throughput + * metric of a single hop neighbour +- * @work: the work queue item ++ * @neigh: the neighbour to probe + */ +-void batadv_v_elp_throughput_metric_update(struct work_struct *work) ++static void ++batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) + { +- struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; +- struct batadv_hardif_neigh_node *neigh; + u32 throughput; + bool valid; + +- neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, +- metric_work); +- neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, +- bat_v); +- + valid = batadv_v_elp_get_throughput(neigh, &throughput); + if (!valid) +- goto put_neigh; ++ return; + + ewma_throughput_add(&neigh->bat_v.throughput, throughput); +- +-put_neigh: +- /* decrement refcounter to balance increment performed before scheduling +- * this task +- */ +- batadv_hardif_neigh_put(neigh); + } + + /** +@@ -273,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct bat + */ + static void batadv_v_elp_periodic_work(struct work_struct *work) + { ++ struct batadv_v_metric_queue_entry *metric_entry; ++ struct batadv_v_metric_queue_entry *metric_safe; + struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_hard_iface *hard_iface; + struct batadv_hard_iface_bat_v *bat_v; + struct batadv_elp_packet *elp_packet; ++ struct list_head metric_queue; + struct batadv_priv *bat_priv; + struct sk_buff *skb; + u32 elp_interval; +- bool ret; + + bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); + hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); +@@ -316,6 +327,8 @@ static void batadv_v_elp_periodic_work(s + + atomic_inc(&hard_iface->bat_v.elp_seqno); + ++ INIT_LIST_HEAD(&metric_queue); ++ + /* The throughput metric is updated on each sent packet. This way, if a + * node is dead and no longer sends packets, batman-adv is still able to + * react timely to its death. +@@ -340,16 +353,28 @@ static void batadv_v_elp_periodic_work(s + + /* Reading the estimated throughput from cfg80211 is a task that + * may sleep and that is not allowed in an rcu protected +- * context. Therefore schedule a task for that. ++ * context. Therefore add it to metric_queue and process it ++ * outside rcu protected context. + */ +- ret = queue_work(batadv_event_workqueue, +- &hardif_neigh->bat_v.metric_work); +- +- if (!ret) ++ metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); ++ if (!metric_entry) { + batadv_hardif_neigh_put(hardif_neigh); ++ continue; ++ } ++ ++ metric_entry->hardif_neigh = hardif_neigh; ++ list_add(&metric_entry->list, &metric_queue); + } + rcu_read_unlock(); + ++ list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { ++ batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); ++ ++ batadv_hardif_neigh_put(metric_entry->hardif_neigh); ++ list_del(&metric_entry->list); ++ kfree(metric_entry); ++ } ++ + restart_timer: + batadv_v_elp_start_timer(hard_iface); + out: +--- a/net/batman-adv/bat_v_elp.h ++++ b/net/batman-adv/bat_v_elp.h +@@ -10,7 +10,6 @@ + #include "main.h" + + #include +-#include + + int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); + void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); +@@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct + void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); + int batadv_v_elp_packet_recv(struct sk_buff *skb, + struct batadv_hard_iface *if_incoming); +-void batadv_v_elp_throughput_metric_update(struct work_struct *work); + + #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ +--- a/net/batman-adv/types.h ++++ b/net/batman-adv/types.h +@@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v { + * neighbor + */ + unsigned long last_unicast_tx; +- +- /** @metric_work: work queue callback item for metric update */ +- struct work_struct metric_work; + }; + + /**