Skip to content

Commit

Permalink
C14 schema (#1182)
Browse files Browse the repository at this point in the history
* New example dir

* Continue extending the schema

* Prettier

* More progress

* Set -99999 as NR value for numeric columns

* Add auto schema check

* Fix indent

* Fix paths

* Fix paths

* Remove local path typo

* Update check_dataset.yml

* Try with target so can come from forks
  • Loading branch information
jfy133 authored Jan 24, 2025
1 parent ecb5333 commit 91278f0
Show file tree
Hide file tree
Showing 4 changed files with 561 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/check_dataset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ jobs:
run: |
echo "## Ancient Metagenome Environmental" >> validation/validation_results_raw.txt
AMDirT validate -s -d -c -m ancientmetagenome-environmental/libraries/ancientmetagenome-environmental_libraries.tsv ancientmetagenome-environmental/libraries/ancientmetagenome-environmental_libraries_schema.json &>> validation/validation_results_raw.txt
# Validate the ancient single genome host-associated radiocarbon dates table
# against its JSON schema with AMDirT.
- name: RADIOCARBON DATES test ancient single genomes (e.g. pathogens)
# always() keeps this step running even when an earlier step has failed.
if: always()
# Append the section headings, then the validator's stdout and stderr (&>>),
# to the shared raw validation results file.
run: |
echo "# Radiocarbon Dates" >> validation/validation_results_raw.txt
echo "## Ancient Single Genome Host Associated" >> validation/validation_results_raw.txt
AMDirT validate -s -d -c -m ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates.tsv ancientsinglegenome-hostassociated/radiocarbondates/ancientsinglegenome-hostassociated_radiocarbondates_schema.json &>> validation/validation_results_raw.txt
- name: cleanup validation results from streamlit warnings
if: always()
run: |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
project_name publication_year data_publication_doi sample_name archive_project archive_sample_accession date_information_present date_is_radiocarbon multiple_dates reference_location reference_citation_depth primary_secondary_reference_citation_doi direct_dating radiocarbon_lab_code spectrometry_type sample_material delta_13c uncalibrated_date uncalibrated_uncertainty_plus_minus calibration_reported calibration_curve calibration_software calibration_software_version calibrated_range_lower calibrated_range_upper calibrated_range_median calibrated_range_suffix reservoir_offset_mentioned reservoir_offset_applied reservoir_offset_reported
AndradesValtuena2017 2017 10.1016/j.cub.2017.10.025 1343UnTal85 PRJEB19335 ERS1892067 true true false main text 2 10.1371/journal.pone.0139705 true MAMS-18949 AMS tooth -20.5 3819 24 true IntCal13 OxCal v4.2.24 4346 4098 NA cal AD true false NA
AndradesValtuena2017 2017 10.1016/j.cub.2017.10.025 6Post PRJEB19335 ERS1892066 true true false main text 2 10.1371/journal.pone.0139705 true MAMS-18955 AMS tooth -20.7 3574 19 true IntCal13 OxCal v4.2.24 3957 3832 NA cal AD true false NA
Spyrou2018 2018 10.1038/s41467-018-04550-9 RT5 PRJEB24296 ERS2106903 true true false main text 1 10.1038/s41467-018-04550-9 true MAMS-29430 NA tooth -99999 3517 27 true NR NR NR 3868 3704 NA cal BP false NA NA
Spyrou2018 2018 10.1038/s41467-018-04550-9 RT6 PRJEB24296 ERS2106904 true true false main text 1 10.1038/s41467-018-04550-9 true MAMS-29431 NA tooth -99999 3499 25 true NR NR NR 3842 3696 NA cal BP false NA NA
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://spaam-community.github.io/AncientMetagenomeDir/ancientsinglegenome-hostassociated/ancientsinglegenome-hostassociated_radiocarbondates_schema.json",
"type": "array",
"title": "JSON schema for AncientMetagenomeDir ancient host-associated single-genome C14 data",
"description": "The JSON schema for AncientMetagenomeDir ancient host-associated radiocarbon date (C14) information",
"additionalItems": false,
"items": {
"$id": "#/items",
"type": "object",
"title": "A radiocarbon date record",
"description": "One row of the radiocarbon dates table: a single date reported for a sample, with its source reference, dating and calibration details.",
"default": {},
"required": [
"project_name",
"publication_year",
"data_publication_doi",
"sample_name",
"archive_project",
"archive_sample_accession"
],
"properties": {
"project_name": {
"$id": "#/items/properties/project_name",
"type": "string",
"title": "AncientMetagenomeDir key of the publication",
"description": "Format: surnameYYYY (if duplicate key but different publication, add b,c,d etc. as necessary). Must match an AncientMetagenomeDir samples table entry",
"pattern": "^[a-zA-Z]+\\d{4}[b-z]?$",
"examples": ["Warinner2014", "Muhlemann2018", "Muhlemann2018b"]
},
"publication_year": {
"$id": "#/items/properties/publication_year",
"type": "integer",
"minimum": 1950,
"maximum": 2100,
"title": "Year of publication",
"description": "Format: YYYY",
"examples": [2014]
},
"data_publication_doi": {
"$id": "#/items/properties/data_publication_doi",
"type": "string",
"pattern": "^10.\\d{4,9}\\/[^,]+$",
"title": "Digital Object Identifier (DOI) of the publication.",
"description": "A valid DOI code (not as a URL). Must match an AncientMetagenomeDir samples table entry",
"examples": ["10.1038/ng.2906"]
},
"sample_name": {
"$id": "#/items/properties/sample_name",
"type": "string",
"title": "Name of the sample",
"description": "In most cases this should be the name of the host individual. Must match an AncientMetagenomeDir samples table entry",
"examples": ["B61"]
},
"archive_project": {
"$id": "#/items/properties/archive_project",
"type": "string",
"title": "Archive project accession",
"description": "Project accession code under which the data is stored in the nucleotide data archive. Must match an AncientMetagenomeDir samples table entry",
"examples": ["PRJNA438985", "mgp13354"]
},
"archive_sample_accession": {
"$id": "#/items/properties/archive_sample_accession",
"type": "string",
"pattern": "^[\\S]+$",
"title": "Archive accession number",
"description": "Sample archive accession numbers; multiple records can be separated with commas. No spaces allowed. Must match an AncientMetagenomeDir samples table entry",
"examples": ["SRS473742,SRS473743,SRS473744,SRS473745"]
},
"date_information_present": {
"$id": "#/items/properties/date_information_present",
"type": "boolean",
"title": "Is Date Information Present?",
"description": "Whether a sample has a specific year-date reported (e.g. 1245, not 15th Century or similar). If false, all other downstream fields should be set to NA",
"examples": [true, false]
},
"date_is_radiocarbon": {
"$id": "#/items/properties/date_is_radiocarbon",
"type": "string",
"title": "Is Date Radiocarbon?",
"enum": ["true", "false", "NA", "NR"],
"description": "Whether the date reported for the sample is a radiocarbon (C14) date. If false, all other downstream fields should be set to NA",
"examples": ["true", "false"]
},
"multiple_dates": {
"$id": "#/items/properties/multiple_dates",
"type": "string",
"title": "Multiple Direct Dates Present?",
"enum": ["true", "false", "NA"],
"description": "Whether multiple (direct) dates are present for this sample; if so make multiple rows for the sample with one date per row.",
"examples": ["true", "false", "NA"]
},
"reference_location": {
"$id": "#/items/properties/reference_location",
"type": "string",
"title": "Location of Reference to Date",
"enum": ["main text", "supplement text", "supplement table"],
"description": "First place where the precise radiocarbon date was recorded in the primary citation publication (i.e. the publication in AncientMetagenomeDir). Order of precedence: main text > supplement text > supplement table.",
"examples": ["main text", "supplement text", "supplement table", "NA"]
},
"reference_citation_depth": {
"$id": "#/items/properties/reference_citation_depth",
"type": "string",
"title": "Reference Citation Depth",
"description": "Number of publications that had to be followed to reach the original report of the date, counting the primary citation publication (i.e. the publication in AncientMetagenomeDir) as 1; e.g. 2 if the date was originally reported in a publication cited by the primary citation publication.",
"examples": ["1", "2", "3", "9", "NA"]
},
"primary_secondary_reference_citation_doi": {
"$id": "#/items/properties/primary_secondary_reference_citation_doi",
"type": "string",
"pattern": "^10.\\d{4,9}\\/[^,]+$",
"title": "Digital Object Identifier (DOI) of the publication in which the date was originally reported.",
"description": "DOI of the primary or secondary reference (i.e. the DOI of the publication in which the date was originally reported)",
"examples": ["10.1038/ng.2906"]
},
"direct_dating": {
"$id": "#/items/properties/direct_dating",
"type": "string",
"title": "Date Directly from Sample?",
"enum": ["true", "false", "NA"],
"description": "Whether the date of the ancient metagenomic sample was directly from the same skeleton (or similar), or whether inferred from other samples in the same context",
"examples": ["true", "false", "NA"]
},
"radiocarbon_lab_code": {
"$id": "#/items/properties/radiocarbon_lab_code",
"type": "string",
"title": "Radiocarbon Lab Code",
"$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/c14_lab_code.json",
"description": "Lab code of the date, from https://radiocarbon.webhost.uits.arizona.edu/laboratories from Labs-2023_02_17.pdf. NA is no date available, NR is date available but no lab code",
"examples": ["OxA", "ANAS", "Beta", "NR", "NA"]
},
"radiocarbon_lab_sample_id": {
"$id": "#/items/properties/radiocarbon_lab_sample_id",
"type": "integer",
"title": "Radiocarbon Lab Sample ID",
"description": "C14 sample code of the radiocarbon date from the lab",
"examples": [12355, 44034]
},
"spectrometry_type": {
"$id": "#/items/properties/spectrometry_type",
"type": "string",
"title": "Spectrometry Type",
"$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/c14_lab_code.json",
"description": "Type of spectrometry used to generate the radiocarbon date. NA is no date available, NR is date available but no spectrometry type reported",
"examples": ["AMS", "IMRS", "NR", "NA"]
},
"sample_material": {
"$id": "#/items/properties/sample_material",
"type": "string",
"title": "Sample Material Used For Dating",
"$ref": "https://spaam-community.github.io/AncientMetagenomeDir/assets/enums/material.json",
"description": "Sample material used for extraction of e.g. collagen for generating the radiocarbon date",
"examples": ["both", "enamel"]
},
"delta_13c": {
"$id": "#/items/properties/delta_13c",
"type": "number",
"title": "δ13C value",
"description": "The δ13C value of the dating in per mille (‰). Not reported should be represented as -99999",
"examples": [-20.5, -17.6, -99999]
},
"uncalibrated_date": {
"$id": "#/items/properties/uncalibrated_date",
"type": "integer",
"minimum": 0,
"maximum": 50000,
"title": "Uncalibrated Date Year",
"description": "The uncalibrated date in radiocarbon years Before Present (BP)",
"examples": [934, 3960, 13000]
},
"uncalibrated_uncertainty_plus_minus": {
"$id": "#/items/properties/uncalibrated_uncertainty_plus_minus",
"type": "integer",
"title": "Uncalibrated Date Year Uncertainty",
"description": "Uncertainty around the uncalibrated date, in radiocarbon years, typically indicated by ±",
"examples": [32, 5, 150]
},
"calibration_reported": {
"$id": "#/items/properties/calibration_reported",
"type": "boolean",
"title": "Is Calibration Reported?",
"description": "Whether the date has additionally been calibrated.",
"examples": [true, false]
},
"calibration_curve": {
"$id": "#/items/properties/calibration_curve",
"type": "string",
"enum": ["IntCal20", "CalPal2007_HULU", "SHCal20", "Marine20", "NR"],
"title": "Calibration Curve",
"description": "The tree-ring calibration curve used for calibration."
},
"calibration_software": {
"$id": "#/items/properties/calibration_software",
"type": "string",
"enum": ["OxCal", "CalPal", "NR"],
"title": "Calibration Software",
"description": "Software used for radiocarbon calibration."
},
"calibration_software_version": {
"$id": "#/items/properties/calibration_software_version",
"type": "string",
"title": "Calibration Software Version",
"description": "Version of the calibration software used (set NR if not reported)",
"examples": ["v1.20", "0.35", "NR"]
},
"calibrated_range_lower": {
"$id": "#/items/properties/calibrated_range_lower",
"type": "integer",
"title": "Lower Date of Calibrated Date Range",
"description": "The lower range of the calibrated date",
"examples": [1650]
},
"calibrated_range_upper": {
"$id": "#/items/properties/calibrated_range_upper",
"type": "integer",
"title": "Upper Date of Calibrated Date Range",
"description": "The upper range of the calibrated date",
"examples": [1450]
},
"calibrated_range_median": {
"$id": "#/items/properties/calibrated_range_median",
"type": "integer",
"title": "Median Date of Calibrated Date Range",
"description": "The median date of the calibrated date range",
"examples": [1550]
},
"calibrated_range_suffix": {
"$id": "#/items/properties/calibrated_range_suffix",
"type": "string",
"enum": ["cal AD", "cal BC", "cal CE", "cal BCE", "cal BP"],
"title": "Suffix of the calibrated date range",
"description": "The suffix of the calibrated date range",
"examples": ["cal BP"]
},
"reservoir_offset_mentioned": {
"$id": "#/items/properties/reservoir_offset_mentioned",
"type": "boolean",
"title": "Is Reservoir Offset Mentioned?",
"description": "Whether a radiocarbon (C14) reservoir offset is mentioned in any form. False here corresponds to not recorded (NR)",
"examples": [true, false]
},
"reservoir_offset_applied": {
"$id": "#/items/properties/reservoir_offset_applied",
"type": "boolean",
"title": "Is Reservoir Offset Applied?",
"description": "Whether an offset correction or recalibration has been reported to have been applied",
"examples": [true, false]
},
"reservoir_offset_reported": {
"$id": "#/items/properties/reservoir_offset_reported",
"type": "integer",
"title": "Reservoir Offset Reported",
"description": "If the actual value of the offset has been reported (set NR if applied but actual value of offset not reported)",
"examples": [250, 400]
}
}
}
}
Loading

0 comments on commit 91278f0

Please sign in to comment.