
Commit fad1f1e

🧵 [#3858] Fix concurrent session updates with multiple uploads
Multiple uploads are fired in parallel by the browser, which causes the views/middleware to process the same session data concurrently. When an upload completes and writes its upload UUID to the session, each upload sees a stale view in which there were no previous uploads, so it only writes itself to the session. The last completed upload is then the only one saved in the session data and available for permission checks. Because earlier uploads are not present in the session data, deleting them results in HTTP 403 errors, as we use the session in this case for access control. Starting multiple submissions (for different forms) in quick succession would have had the same problem, so we may have fixed another unreported problem.

Note that updating the session here is very tricky code - however, using some kind of "signed URL" is no proper solution either. We already use near-impossible-to-guess UUIDs in the URLs, but the risk remains the same: if you have the URL, you can read/delete potentially sensitive data that should only be available to the uploader. So we *need* to maintain server-side state to guarantee this.

As for the fix itself, I've opted not to solve this in a generic way that would allow concurrency on session data in general; the fact that Django doesn't handle this out of the box seems like a good warning about the complexity involved in solving such a problem. In particular, tracking which changes are deliberate and which are stale does not seem feasible. Instead, a very localized solution is applied in problem areas known to suffer from concurrency issues, of which the bulk upload of files in the browser is probably the most common.

The locking parameters have been guesstimated, taking into account the number of parallel requests browsers actually execute (typically 6). Local profiling shows that acquiring the lock itself is very fast.

This makes Redis a hard dependency in the codebase, but we didn't really support alternative cache backends anyway.

Backport-of: #3869
1 parent 3e034cb commit fad1f1e
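
To make the lost-update race concrete, here is a minimal, self-contained sketch (hypothetical names, not code from this repository) of the read-modify-write pattern the fix addresses:

import threading
import time

# Stand-in for the session storage backend (Redis/cache in production).
backend = {"uploads": []}

def handle_upload(upload_uuid: str) -> None:
    # 1. each request loads its own copy of the session data
    session_data = {key: list(value) for key, value in backend.items()}
    time.sleep(0.05)  # both requests now hold the same stale snapshot
    # 2. each request appends only its own upload UUID to the stale copy
    session_data["uploads"].append(upload_uuid)
    # 3. the whole session is written back, clobbering the parallel write
    backend["uploads"] = session_data["uploads"]

threads = [
    threading.Thread(target=handle_upload, args=(f"uuid-{i}",)) for i in (1, 2)
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

print(backend["uploads"])  # e.g. ['uuid-2'] -- the other upload was lost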

File tree

4 files changed: +158 -15 lines changed


src/openforms/conf/base.py

+7 -3

@@ -112,6 +112,8 @@
             "IGNORE_EXCEPTIONS": True,
         },
     },
+    # TODO: rename to 'redis-locks' and get rid of portalocker in favour of plain
+    # redis locks?
     "portalocker": {
         "BACKEND": "django_redis.cache.RedisCache",
         "LOCATION": f"redis://{config('CACHE_PORTALOCKER', 'localhost:6379/0')}",
@@ -465,9 +467,11 @@
             "level": "DEBUG",
         },
         "log_outgoing_requests": {
-            "handlers": ["log_outgoing_requests", "save_outgoing_requests"]
-            if LOG_REQUESTS
-            else [],
+            "handlers": (
+                ["log_outgoing_requests", "save_outgoing_requests"]
+                if LOG_REQUESTS
+                else []
+            ),
             "level": "DEBUG",
             "propagate": True,
         },
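
For reference, the "portalocker" alias configured above is a django-redis cache, which exposes the underlying redis-py distributed lock via cache.lock() - the same interface _session_lock uses below. A minimal usage sketch (the lock name here is made up):

from django.core.cache import caches

redis_cache = caches["portalocker"]

# timeout: max lifetime of the lock (auto-expires if release is never called);
# blocking_timeout: how long acquisition may wait before giving up.
with redis_cache.lock("demo:some-lock", timeout=2.0, blocking_timeout=2.0):
    ...  # critical section: at most one holder across processes/threads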

src/openforms/formio/tests/test_api_fileupload.py

+53 -1

@@ -1,8 +1,10 @@
 import os
 import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from unittest.mock import patch

+from django.conf import settings
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.test import override_settings, tag
 from django.utils.translation import gettext as _
@@ -11,7 +13,7 @@
 from privates.test import temp_private_root
 from rest_framework import status
 from rest_framework.reverse import reverse
-from rest_framework.test import APITestCase
+from rest_framework.test import APITestCase, APITransactionTestCase

 from openforms.config.models import GlobalConfiguration
 from openforms.submissions.attachments import temporary_upload_from_url
@@ -344,3 +346,53 @@ def test_cannot_connect_to_clamdav(self, m_config):
         tmpdir_contents = os.listdir(tmpdir)

         self.assertEqual(0, len(tmpdir_contents))
+
+
+@override_settings(
+    # Deliberately set to cache backend to not fall in the trap of using DB row-level
+    # locking. This also reflects how we deploy in prod.
+    SESSION_ENGINE="django.contrib.sessions.backends.cache",
+    SESSION_CACHE_ALIAS="session",
+    CACHES={
+        **settings.CACHES,
+        "session": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
+    },
+)
+class ConcurrentUploadTests(SubmissionsMixin, APITransactionTestCase):
+
+    @tag("gh-3858")
+    def test_concurrent_file_uploads(self):
+        submission = SubmissionFactory.from_components(
+            [
+                {
+                    "type": "file",
+                    "key": "file",
+                    "label": "Some upload",
+                    "multiple": True,
+                }
+            ]
+        )
+        self._add_submission_to_session(submission)
+        endpoint = reverse("api:formio:temporary-file-upload")
+
+        def do_upload() -> str:
+            file = SimpleUploadedFile(
+                "my-file.txt", b"my content", content_type="text/plain"
+            )
+            response = self.client.post(endpoint, {"file": file}, format="multipart")
+            assert response.status_code == status.HTTP_200_OK
+            resp_data = response.json()
+            return resp_data["url"]
+
+        # do both uploads in parallel in their own thread
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            futures = [executor.submit(do_upload) for _ in range(0, 2)]
+            urls = [future.result() for future in as_completed(futures)]
+
+        uuids = {
+            url.removeprefix("http://testserver/api/v2/submissions/files/")
+            for url in urls
+        }
+
+        session_uuids = set(self.client.session[UPLOADS_SESSION_KEY])
+        self.assertEqual(session_uuids, uuids)

src/openforms/submissions/api/viewsets.py

-1

@@ -172,7 +172,6 @@ def perform_create(self, serializer):

         # store the submission ID in the session, so that only the session owner can
         # mutate/view the submission
-        # note: possible race condition with concurrent requests
         add_submmission_to_session(serializer.instance, self.request.session)

         logevent.submission_start(serializer.instance)

src/openforms/submissions/utils.py

+98 -10

@@ -1,8 +1,10 @@
 import logging
+from contextlib import contextmanager
 from typing import Any, Union

 from django.conf import settings
 from django.contrib.sessions.backends.base import SessionBase
+from django.core.cache import caches
 from django.http import HttpRequest
 from django.utils import translation

@@ -37,23 +39,109 @@

 logger = logging.getLogger(__name__)

+# with the interval of 0.1s, this gives us 2.0 / 0.1 = 20 concurrent requests,
+# which is far above the typical browser concurrency limit (~6-8 requests).
+SESSION_LOCK_TIMEOUT_SECONDS = 2.0
+
+
+@contextmanager
+def _session_lock(session: SessionBase, key: str):
+    """
+    Helper to manage session data mutations for the specified key.
+
+    Concurrent session updates see stale data from when the request initially
+    got processed, so any added items from parallel requests are not taken into
+    account. This context manager refreshes the session data just-in-time and uses
+    a Redis distributed lock to synchronize access.
+
+    .. note:: this is pretty deep in Django internals, there doesn't appear to be a
+       public API for things like these :(
+    """
+    # only existing sessions have an existing key. If this is a new session, it hasn't
+    # been persisted to the backend yet, so there is also no possible race condition.
+    is_new = session.session_key is None
+    if is_new:
+        yield
+        return
+
+    # See TODO in settings about renaming this cache
+    redis_cache = caches["portalocker"]
+
+    # make the lock tied to the session itself, so that we don't affect other people's
+    # sessions.
+    cache_key = f"django:session-update:{session.session_key}"
+
+    # this is... tricky. To ensure we aren't still operating on stale data, we refresh
+    # the session data after acquiring a lock so that we're the only one that will be
+    # writing to it.
+    #
+    # For the locking interface, see redis-py :meth:`redis.client.Redis.lock`

+    logger.debug("Acquiring session lock for session %s", session.session_key)
+    with redis_cache.lock(
+        cache_key,
+        # max lifetime for the lock itself, must always be provided in case something
+        # crashes and we fail to call release
+        timeout=SESSION_LOCK_TIMEOUT_SECONDS,
+        # wait rather than failing immediately, we are trying to handle parallel
+        # requests here. Can't explicitly specify this, see
+        # https://github.com/jazzband/django-redis/issues/596. redis-py default is True.
+        # blocking=True,
+        # how long we can try to acquire the lock
+        blocking_timeout=SESSION_LOCK_TIMEOUT_SECONDS,
+    ):
+        logger.debug("Got session lock for session %s", session.session_key)
+        # nasty bit... the session itself can already be modified with *other*
+        # information that isn't relevant. So, we load the data from the storage again
+        # and only look at the provided key. If that one is different, we update our
+        # local data. We can not just reset to the result of session.load(), as that
+        # would discard modifications that should be persisted.
+        persisted_data = session.load()
+        if (data_slice := persisted_data.get(key)) != (current := session.get(key)):
+            logger.debug(
+                "Data from storage is different than what we currently have. "
+                "Session %s, key '%s' - in storage: %s, our view: %s",
+                session.session_key,
+                key,
+                data_slice,
+                current,
+            )
+            session[key] = data_slice
+            logger.debug(
+                "Updated key '%s' from storage for session %s", key, session.session_key
+            )
+
+        # execute the calling code and exit, clearing the lock.
+        yield
+
+        logger.debug(
+            "New session data for session %s is: %s",
+            session.session_key,
+            session._session,
+        )
+
+        # ensure we save in-between to persist the modifications, before the request
+        # may even be finished
+        session.save()
+        logger.debug("Saved session data for session %s", session.session_key)
+

 def append_to_session_list(session: SessionBase, session_key: str, value: Any) -> None:
-    # note: possible race condition with concurrent requests
-    active = session.get(session_key, [])
-    if value not in active:
-        active.append(value)
-        session[session_key] = active
+    with _session_lock(session, session_key):
+        active = session.get(session_key, [])
+        if value not in active:
+            active.append(value)
+            session[session_key] = active


 def remove_from_session_list(
     session: SessionBase, session_key: str, value: Any
 ) -> None:
-    # note: possible race condition with concurrent requests
-    active = session.get(session_key, [])
-    if value in active:
-        active.remove(value)
-        session[session_key] = active
+    with _session_lock(session, session_key):
+        active = session.get(session_key, [])
+        if value in active:
+            active.remove(value)
+            session[session_key] = active


 def add_submmission_to_session(submission: Submission, session: SessionBase) -> None:
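
By way of illustration, callers now go through the lock-protected helpers; a hypothetical sketch of recording an upload in the session (the view function and the session key value are assumptions, not part of this diff):

from openforms.submissions.utils import append_to_session_list

UPLOADS_SESSION_KEY = "form-uploads"  # placeholder value; the real constant lives elsewhere in the codebase

def register_upload(request, upload_uuid: str) -> None:
    # _session_lock re-reads the persisted session under the Redis lock before
    # appending, so parallel uploads no longer overwrite each other's UUIDs.
    append_to_session_list(request.session, UPLOADS_SESSION_KEY, upload_uuid)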
