Skip to content

Commit 0419a27

Browse files
⚡ [#5084] Optimize FormVariable.objects.synchronize_for
There are a number of aspects to this performance patch. 1. Avoid database load The UPSERT query when sending all of the desired form variables leads to hefty queries (250ms for 1100 variables, 1100ms for ~7K). Splitting this up in work that can be ignore brings down the query duration to about 50-60ms for 1000 variables. 2. Avoid expensive formio definition processing By far the most expensive operation is scanning whether a component is inside an editgrid ("repeating group") through the utility component_in_editgrid, which was parsing the same configuration over and over again. Instead, we can use the already-existing flag of iter_components to not recurse into edit grids in the first place. This fixes most of the time spent in Python. 3. Replace deepcopy with shallow copy This is probably the most controversial one - when deepcopying a django model instance, it goes through all the motions to serialize it for picking, which means that it must figure out the reconstruction function to use and capture all the necessary data, and deepcopy recurses, which means it also goes into the related form_definition and thus the formio configuration object which is full of lists/dicts that are expensive to deep copy. The replacement with a shallow copy should be fine because: * we're not copying any existing database instances (pk=None) * all the kwargs in initial instantiation are calculated and provided explicitly, there is no reliance on mutation * when 'copying' the desired variables for each form, we assign the optimized form_id attribute and don't do any mutations, i.e. all operations remain on the shallow level This is covered with some tests to hopefully prevent future regressions. Other ideas considered: * don't store FormVariables in the database, but instead create them in memory on the fly. This will be a problem once we no longer store prefill configuration in the component, we will require actual DB instances. It's also not very intuitive * offload to celery again. This is what we initially patched as it was leading to race conditions and dead locks and general performance issues too. It's may also strangely affect existing submissions. Given the complexity and the performance gains, this was not further explored. On my local machine, this brings the worst case insert in the test (1000 form variables from a form definition with 1000 components) from 10+ seconds down to 400ms, so about a factor 25 improvement. Backport-of: #5085
1 parent cc5cca0 commit 0419a27

File tree

4 files changed

+463
-128
lines changed

4 files changed

+463
-128
lines changed

src/openforms/formio/utils.py

-16
Original file line numberDiff line numberDiff line change
@@ -177,22 +177,6 @@ def is_layout_component(component: Component) -> bool:
177177
return False
178178

179179

180-
def component_in_editgrid(configuration: dict, component: dict) -> bool:
181-
# Get all the editgrid components in the configuration
182-
editgrids = []
183-
for comp in iter_components(configuration=configuration):
184-
if comp["type"] == "editgrid":
185-
editgrids.append(comp)
186-
187-
# Check if the component is in the editgrid
188-
for editgrid in editgrids:
189-
for comp in iter_components(configuration=editgrid):
190-
if comp["key"] == component["key"]:
191-
return True
192-
193-
return False
194-
195-
196180
def get_component_datatype(component):
197181
component_type = component["type"]
198182
if component.get("multiple"):

src/openforms/forms/models/form_variable.py

+135-108
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
from __future__ import annotations
22

3-
from copy import deepcopy
4-
from typing import TYPE_CHECKING, ClassVar
3+
import itertools
4+
from copy import copy
5+
from typing import ClassVar
56

67
from django.db import models, transaction
78
from django.db.models import CheckConstraint, Q
89
from django.utils.translation import gettext_lazy as _
910

10-
from glom import Path, glom
11+
import elasticapm
1112

1213
from openforms.formio.utils import (
13-
component_in_editgrid,
1414
get_component_datatype,
1515
get_component_default_value,
1616
is_layout_component,
@@ -21,90 +21,42 @@
2121
from openforms.variables.constants import FormVariableDataTypes, FormVariableSources
2222
from openforms.variables.utils import check_initial_value
2323

24+
from .form import Form
2425
from .form_definition import FormDefinition
2526

26-
if TYPE_CHECKING:
27-
from .form import Form
28-
from .form_step import FormStep
29-
30-
3127
EMPTY_PREFILL_PLUGIN = Q(prefill_plugin="")
3228
EMPTY_PREFILL_ATTRIBUTE = Q(prefill_attribute="")
3329
EMPTY_PREFILL_OPTIONS = Q(prefill_options={})
3430
USER_DEFINED = Q(source=FormVariableSources.user_defined)
3531

32+
# these are the attributes that are derived from the matching formio component,
33+
# see FormVariableManager.synchronize_for. Other attributes are relational or
34+
# related to user defined variables (like service fetch, prefill options...).
35+
UPSERT_ATTRIBUTES_TO_COMPARE: tuple[str, ...] = (
36+
"prefill_plugin",
37+
"prefill_attribute",
38+
"prefill_identifier_role",
39+
"name",
40+
"is_sensitive_data",
41+
"data_type",
42+
"initial_value",
43+
)
44+
3645

3746
class FormVariableManager(models.Manager["FormVariable"]):
3847
use_in_migrations = True
3948

4049
@transaction.atomic
41-
def create_for_form(self, form: "Form") -> None:
42-
form_steps = form.formstep_set.select_related("form_definition")
50+
def create_for_form(self, form: Form) -> None:
51+
form_steps = form.formstep_set.select_related( # pyright: ignore[reportAttributeAccessIssue]
52+
"form_definition"
53+
)
4354

4455
for form_step in form_steps:
4556
self.synchronize_for(form_step.form_definition)
4657

47-
def create_for_formstep(self, form_step: FormStep) -> list[FormVariable]:
48-
form_definition_configuration = form_step.form_definition.configuration
49-
component_keys = [
50-
component["key"]
51-
for component in iter_components(
52-
configuration=form_definition_configuration, recursive=True
53-
)
54-
]
55-
existing_form_variables_keys = form_step.form.formvariable_set.filter(
56-
key__in=component_keys,
57-
form_definition=form_step.form_definition,
58-
).values_list("key", flat=True)
59-
60-
form_variables = []
61-
for component in iter_components(
62-
configuration=form_definition_configuration, recursive=True
63-
):
64-
if (
65-
is_layout_component(component)
66-
or component["type"] in ("content", "softRequiredErrors")
67-
or component["key"] in existing_form_variables_keys
68-
or component_in_editgrid(form_definition_configuration, component)
69-
):
70-
continue
71-
72-
form_variables.append(
73-
self.model(
74-
form=form_step.form,
75-
form_definition=form_step.form_definition,
76-
prefill_plugin=glom(
77-
component,
78-
Path("prefill", "plugin"),
79-
default="",
80-
skip_exc=KeyError,
81-
)
82-
or "",
83-
prefill_attribute=glom(
84-
component,
85-
Path("prefill", "attribute"),
86-
default="",
87-
skip_exc=KeyError,
88-
)
89-
or "",
90-
prefill_identifier_role=glom(
91-
component,
92-
Path("prefill", "identifierRole"),
93-
default=IdentifierRoles.main,
94-
skip_exc=KeyError,
95-
),
96-
key=component["key"],
97-
name=component.get("label") or component["key"],
98-
is_sensitive_data=component.get("isSensitiveData", False),
99-
source=FormVariableSources.component,
100-
data_type=get_component_datatype(component),
101-
initial_value=get_component_default_value(component),
102-
)
103-
)
104-
105-
return self.bulk_create(form_variables)
106-
107-
@transaction.atomic
58+
@elasticapm.capture_span(span_type="app.core.models")
59+
@transaction.atomic(savepoint=False)
10860
def synchronize_for(self, form_definition: FormDefinition):
10961
"""
11062
Synchronize the form variables for a given form definition.
@@ -120,32 +72,40 @@ def synchronize_for(self, form_definition: FormDefinition):
12072
"""
12173
# Build the desired state
12274
desired_variables: list[FormVariable] = []
75+
desired_keys: set[str] = set()
12376
# XXX: looping over the configuration_wrapper is not (yet) viable because it
12477
# also yields the components nested inside edit grids, which we need to ignore.
125-
# So, we stick to iter_components. Performance wise this should be okay since we
126-
# only need to do one pass.
78+
# So, we stick to iter_components. We deliberately do a single pass to process
79+
# the formio definition and then "copy" the information for each affected form
80+
# so that we can avoid excessive component tree processing.
12781
configuration = form_definition.configuration
128-
for component in iter_components(configuration=configuration, recursive=True):
82+
for component in iter_components(
83+
configuration=configuration,
84+
recursive=True,
85+
# components inside edit grids are not real variables
86+
recurse_into_editgrid=False,
87+
):
12988
# we need to ignore components that don't actually hold any values - there's
13089
# no point to create variables for those.
13190
if is_layout_component(component):
13291
continue
13392
if component["type"] in ("content", "softRequiredErrors"):
13493
continue
135-
if component_in_editgrid(configuration, component):
136-
continue
13794

13895
# extract options from the component
139-
prefill_plugin = glom(component, "prefill.plugin", default="") or ""
140-
prefill_attribute = glom(component, "prefill.attribute", default="") or ""
141-
prefill_identifier_role = glom(
142-
component, "prefill.identifierRole", default=IdentifierRoles.main
96+
prefill = component.get("prefill", {})
97+
prefill_plugin = prefill.get("plugin") or ""
98+
prefill_attribute = prefill.get("attribute") or ""
99+
prefill_identifier_role = (
100+
prefill.get("identifierRole") or IdentifierRoles.main
143101
)
144102

103+
key = component["key"]
104+
desired_keys.add(key)
145105
desired_variables.append(
146106
self.model(
147107
form=None, # will be set later when visiting all affected forms
148-
key=component["key"],
108+
key=key,
149109
form_definition=form_definition,
150110
prefill_plugin=prefill_plugin,
151111
prefill_attribute=prefill_attribute,
@@ -158,8 +118,6 @@ def synchronize_for(self, form_definition: FormDefinition):
158118
)
159119
)
160120

161-
desired_keys = [variable.key for variable in desired_variables]
162-
163121
# if the Formio configuration of the form definition itself is updated and
164122
# components have been removed or their keys have changed, we know for certain
165123
# we can discard those old form variables - it doesn't matter which form they
@@ -169,41 +127,81 @@ def synchronize_for(self, form_definition: FormDefinition):
169127
)
170128
stale_variables.delete()
171129

172-
# check which form (steps) are affected and patch them up. It is irrelevant whether
130+
# Check which forms are affected and patch them up.
131+
# It is irrelevant whether
173132
# the form definition is re-usable or not, though semantically at most one form step
174133
# should be found for single-use form definitions.
175-
# fmt: off
176-
affected_form_steps = (
177-
form_definition
178-
.formstep_set # pyright: ignore[reportAttributeAccessIssue]
179-
.select_related("form")
134+
affected_forms = (
135+
Form.objects.filter(formstep__form_definition=form_definition)
136+
.order_by("id")
137+
.distinct("id")
138+
.values_list("pk", flat=True)
139+
.iterator()
180140
)
181-
# fmt: on
182141

183-
# Finally, collect all the instances and efficiently upsert them - creating missing
142+
# Collect all the instances and efficiently upsert them - creating missing
184143
# variables and updating existing variables in a single query.
185144
to_upsert: list[FormVariable] = []
186-
for step in affected_form_steps:
145+
146+
# We check which form variables actually need to be updated or inserted. If
147+
# naively sending everything to the UPSERT we are sending pointless data to
148+
# Postgres which puts unnecessary load.
149+
existing_form_variables = (
150+
self.filter(form_definition=form_definition).order_by("form_id").iterator()
151+
)
152+
# keep track of (form_id, form_key) tuples that were considered, so that we can
153+
# efficiently decide whether we can ignore it or not based on existing variables.
154+
seen: set[tuple[int, str]] = set()
155+
156+
# first look at form variables that already exist since we can exclude those
157+
# from the upsert
158+
form_variables_by_form = itertools.groupby(
159+
existing_form_variables, key=lambda fv: fv.form_id
160+
)
161+
for form_id, variables in form_variables_by_form:
162+
assert form_id is not None
163+
variables_by_key = {variable.key: variable for variable in variables}
164+
165+
def _add_variable(variable: FormVariable):
166+
form_specific_variable = copy(variable)
167+
form_specific_variable.form_id = form_id
168+
to_upsert.append(form_specific_variable)
169+
170+
# check whether we need to create or update the variable by comparing against
171+
# existing variables.
172+
for desired_variable in desired_variables:
173+
existing_variable = variables_by_key.get(key := desired_variable.key)
174+
seen.add((form_id, key))
175+
176+
if existing_variable is None:
177+
_add_variable(desired_variable)
178+
continue
179+
180+
# otherwise, check if we need to update or can skip this variable to
181+
# make the upsert query smaller
182+
if not existing_variable.matches(desired_variable):
183+
# it needs to be updated
184+
_add_variable(desired_variable)
185+
186+
# Finally, process variables that don't exist yet at all
187+
for form_id in affected_forms:
187188
for variable in desired_variables:
188-
form_specific_variable = deepcopy(variable)
189-
form_specific_variable.form = step.form
189+
_lookup = (form_id, variable.key)
190+
# it already exists and has been processed
191+
if _lookup in seen:
192+
continue
193+
194+
# it doesn't exist and needs to be created
195+
form_specific_variable = copy(variable)
196+
form_specific_variable.form_id = form_id
190197
to_upsert.append(form_specific_variable)
191198

192199
self.bulk_create(
193200
to_upsert,
194201
# enables UPSERT behaviour so that existing records get updated and missing
195202
# records inserted
196203
update_conflicts=True,
197-
update_fields=(
198-
"prefill_plugin",
199-
"prefill_attribute",
200-
"prefill_identifier_role",
201-
"name",
202-
"is_sensitive_data",
203-
"source",
204-
"data_type",
205-
"initial_value",
206-
),
204+
update_fields=UPSERT_ATTRIBUTES_TO_COMPARE + ("source",),
207205
unique_fields=("form", "key"),
208206
)
209207

@@ -215,6 +213,7 @@ class FormVariable(models.Model):
215213
help_text=_("Form to which this variable is related"),
216214
on_delete=models.CASCADE,
217215
)
216+
form_id: int | None
218217
form_definition = models.ForeignKey(
219218
to=FormDefinition,
220219
verbose_name=_("form definition"),
@@ -357,6 +356,11 @@ class Meta:
357356
def __str__(self):
358357
return _("Form variable '{key}'").format(key=self.key)
359358

359+
def save(self, *args, **kwargs):
360+
self.check_data_type_and_initial_value()
361+
362+
super().save(*args, **kwargs)
363+
360364
def get_initial_value(self):
361365
return self.initial_value
362366

@@ -378,7 +382,30 @@ def check_data_type_and_initial_value(self):
378382
if self.source == FormVariableSources.user_defined:
379383
self.initial_value = check_initial_value(self.initial_value, self.data_type)
380384

381-
def save(self, *args, **kwargs):
382-
self.check_data_type_and_initial_value()
385+
def matches(self, other: FormVariable) -> bool:
386+
"""
387+
Check if the other form variable matches this instance.
383388
384-
super().save(*args, **kwargs)
389+
Matching can only be performed on component variables to check if they are
390+
still in sync. Foreign key relations to form etc. are ignored as this doesn't
391+
contain semantic information.
392+
"""
393+
assert (
394+
self.source == FormVariableSources.component
395+
), "Can only compare component variables"
396+
assert (
397+
other.source == FormVariableSources.component
398+
), "Can only compare component variables"
399+
assert self.key == other.key, (
400+
"Different keys are being compared, are you sure you're comparing "
401+
"the right instances?"
402+
)
403+
404+
# these are the attributes that are derived from the matching formio component,
405+
# see FormVariableManager.synchronize_for. Other attributes are relational or
406+
# related to user defined variables (like service fetch, prefill options...).
407+
all_equal = all(
408+
getattr(self, attr) == getattr(other, attr)
409+
for attr in UPSERT_ATTRIBUTES_TO_COMPARE
410+
)
411+
return all_equal

src/openforms/forms/tests/factories.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -188,14 +188,14 @@ def create(
188188
**kwargs,
189189
) -> FormStep:
190190
form_step = super().create(**kwargs)
191-
FormVariable.objects.create_for_formstep(form_step)
191+
FormVariable.objects.synchronize_for(form_step.form_definition)
192192
return form_step
193193

194194
@classmethod
195195
def _create(cls, *args, **kwargs):
196196
# This method is called instead of create() from the FormFactory with `generte_minimal_setup`
197197
form_step = super()._create(*args, **kwargs)
198-
FormVariable.objects.create_for_formstep(form_step)
198+
FormVariable.objects.synchronize_for(form_step.form_definition)
199199
return form_step
200200

201201

0 commit comments

Comments
 (0)