From de824b4f3ab013f4c84f4cf0ca0401565f614602 Mon Sep 17 00:00:00 2001 From: Mike Kelly Date: Mon, 10 Feb 2025 08:17:27 +0000 Subject: [PATCH] Codelist create form: Check for `UnicodeDecodeError`. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise there can be an unhandled exception and 500 error if trying to upload a file encoded dfferently, with bad content, or if an XLSX is accidentally uploaded in place of CSV text. From Sentry reports, both cases seem fairly common. The text should be clear and actionable for all users, including those who might mistakenly upload an XLSX file. It should also provide enough detail to help more experienced users resolve issues with text encoding or file content. We shouldn't enforce file extension validation here. The problem isn't the filename; it's the file's encoding. We can process valid CSV data regardless of the .csv extension, which is just a convention. Adding this requirement could complicate matters unnecessarily, confuse users, or frustrate those who don’t follow the .csv naming convention. I considered checking the file extension to mention if it wasn't .csv, but it didn’t seem necessary. The error message should explain clearly what the user needs to do in either case. There were no tests of this form specifically so I added a basic valid form test as well as the test of the error case. codelists/tests/views/test_codelist_create.py has tests of the view as a whole but they seemed to relate to behaviour before or after the form is processed. --- opencodelists/forms.py | 12 +++++++-- opencodelists/tests/test_forms.py | 41 ++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/opencodelists/forms.py b/opencodelists/forms.py index e12ce7e6..5c33f58f 100644 --- a/opencodelists/forms.py +++ b/opencodelists/forms.py @@ -77,13 +77,21 @@ def clean_csv_data(self): if not f: return + try: + data = f.read().decode("utf-8-sig") + except UnicodeDecodeError as exception: + raise forms.ValidationError( + "File could not be read. Please ensure the file contains CSV " + "data (not Excel, for example). It should be a text file encoded " + f"in the UTF-8 format. Error details: {exception}." + ) + # Eventually coding system version may be a selectable field, but for now it - # just defaults to using the most recent one + # just defaults to using the most recent one. coding_system = CODING_SYSTEMS[ self.cleaned_data["coding_system_id"] ].get_by_release_or_most_recent() - data = f.read().decode("utf-8-sig") codes = [row[0] for row in csv.reader(StringIO(data)) if row] validate_csv_data_codes(coding_system, codes) return codes diff --git a/opencodelists/tests/test_forms.py b/opencodelists/tests/test_forms.py index c3cda63b..3afc2ef0 100644 --- a/opencodelists/tests/test_forms.py +++ b/opencodelists/tests/test_forms.py @@ -1,7 +1,8 @@ import pytest from django.core.exceptions import ValidationError +from django.core.files.base import ContentFile -from ..forms import MembershipCreateForm, UserPasswordForm +from ..forms import CodelistCreateForm, MembershipCreateForm, UserPasswordForm def test_userpasswordform_different_passwords(): @@ -48,3 +49,41 @@ def test_membership_create_form_no_user(organisation): ) assert form.is_valid() is False assert form.errors == {"user_idenitfier": ["User unknown@test.com does not exist"]} + + +class TestCodelistCreateForm: + def _bind_form(self, file): + return CodelistCreateForm( + data={ + "name": "test name", + "coding_system_id": "snomedct", + }, + files={ + "csv_data": file, + }, + owner_choices=[], + ) + + def test_bound_form_valid(self, disorder_of_elbow_codes): + valid_csv_data = "\n".join(disorder_of_elbow_codes) + file = ContentFile(valid_csv_data.encode("utf-8")) + file.name = "valid_data.csv" + form = self._bind_form(file) + + assert form.is_valid() + assert form.cleaned_data["name"] == "test name" + assert form.cleaned_data["coding_system_id"] == "snomedct" + assert form.cleaned_data["csv_data"] == disorder_of_elbow_codes + + def test_bound_form_invalid_file_unicode_error(self): + invalid_utf8_data = b"\xff\xfe\xfd" + file = ContentFile(invalid_utf8_data) + file.name = "invalid_data.csv" + form = self._bind_form(file) + + assert not form.is_valid() + errors = form.errors.get("csv_data") + len(errors) == 1 + assert ( + "File could not be read. Please ensure the file contains CSV" in errors[0] + )