diff --git a/bin/experiments/bench_explicit_return.pl b/bin/experiments/bench_explicit_return.pl
new file mode 100644
index 000000000..0050321fa
--- /dev/null
+++ b/bin/experiments/bench_explicit_return.pl
@@ -0,0 +1,28 @@
+
+use Benchmark qw {:all};
+use 5.016;
+
+$| = 1;
+
+
+cmpthese (
+    -3,
+    {
+        er => sub {rand(); return},
+        nr => sub {rand(); },
+    }
+);
+
+
+__END__
+
+# results:
+50_000_000
+        Rate       er       nr
+er 20517029/s       --     -45%
+nr 37202381/s      81%       --
+
+-3
+        Rate       er       nr
+er 18205338/s       --     -52%
+nr 37690645/s     107%       --
diff --git a/bin/experiments/bench_get_params.pl b/bin/experiments/bench_get_params.pl
new file mode 100644
index 000000000..04e36f832
--- /dev/null
+++ b/bin/experiments/bench_get_params.pl
@@ -0,0 +1,47 @@
+
+use Benchmark qw {:all};
+use 5.016;
+use Data::Dumper;
+
+#use List::Util qw {:all};
+
+my @vals = 1 .. 20;
+my $self = {PARAMS => {label_hash1 => 5, @vals}};
+
+#my $n = 1000;
+#my @a1 = (0 .. $n);
+
+my $param = 'label_hash1';
+
+cmpthese (
+    3000000,
+    {
+        old1 => sub {old1 ($self, $param)},
+        new1 => sub {new1 ($self, $param)},
+        new2 => sub {new2 ($self, $param)},
+        new3 => sub {new3 ($self, $param)},
+    }
+);
+
+
+#say Dumper $self;
+
+sub old1 {
+    return if ! exists $_[0]->{PARAMS}{$_[1]};
+    return $_[0]->{PARAMS}{$_[1]};
+}
+
+
+sub new1 {
+    no autovivification;
+    return $_[0]->{PARAMS}{$_[1]};
+}
+
+sub new2 {
+    no autovivification;
+    $_[0]->{PARAMS}{$_[1]};
+}
+
+sub new3 {
+    exists $_[0]->{PARAMS}{$_[1]} && $_[0]->{PARAMS}{$_[1]};
+}
diff --git a/bin/experiments/bench_hash_key_set.pl b/bin/experiments/bench_hash_key_set.pl
new file mode 100644
index 000000000..1bd53d641
--- /dev/null
+++ b/bin/experiments/bench_hash_key_set.pl
@@ -0,0 +1,105 @@
+
+use Benchmark qw {:all};
+use 5.016;
+use Data::Dumper;
+
+my @keys;
+for (0..16000) {
+    push @keys, "$_:$_";
+}
+
+my @keys_outer = @keys[0..257];
+
+
+$| = 1;
+
+cmpthese (
+    3,
+    {
+        none       => sub {no_set_keys()},
+        outer      => sub {set_keys_outer()},
+        outer_init => sub {set_keys_outer_init()},
+        inner      => sub {set_keys_inner()},
+    }
+);
+
+
+sub no_set_keys {
+    state $run_count;
+    $run_count ++;
+    say 'nsk ' . $run_count if !($run_count % 5);
+    my %hash;
+    foreach my $key1 (@keys_outer) {
+        foreach my $key2 (@keys) {
+            $hash{$key1}{$key2}++;
+        }
+    }
+
+}
+
+sub set_keys_outer {
+    state $run_count;
+    $run_count ++;
+    say 'sko ' . $run_count if !($run_count % 5);
+
+    my %hash;
+    keys %hash = scalar @keys_outer;
+    foreach my $key1 (@keys_outer) {
+        foreach my $key2 (@keys) {
+            $hash{$key1}{$key2}++;
+        }
+    }
+}
+
+sub set_keys_outer_init {
+    state $run_count;
+    $run_count ++;
+    say 'skoi ' . $run_count if !($run_count % 5);
+
+    my %hash;
+    keys %hash = scalar @keys_outer;
+    foreach my $key1 (@keys_outer) {
+        $hash{$key1} //= {};
+        foreach my $key2 (@keys) {
+            $hash{$key1}{$key2}++;
+        }
+    }
+}
+
+sub set_keys_inner {
+    state $run_count;
+    $run_count ++;
+    say 'ski ' . $run_count if !($run_count % 5);
+
+    my %hash;
+    keys %hash = scalar @keys_outer;
+    foreach my $key1 (@keys_outer) {
+        $hash{$key1} //= {};
+        keys %{$hash{$key1}} = scalar @keys;
+        foreach my $key2 (@keys) {
+            $hash{$key1}{$key2}++;
+        }
+    }
+}
+
+__END__
+
+The differences are all in the noise.
+
+results on HPC with 5.20.0 using rand() as the keys:
+
+           s/iter outer_init      outer       none      inner
+outer_init   5.17         --        -0%        -0%        -2%
+outer        5.17         0%         --        -0%        -2%
+none         5.17         0%         0%         --        -2%
+inner        5.07         2%         2%         2%         --
+
+
+Small relative improvement when using "$_:$_" as the keys,
+but the absolute values are also far less than for rand() keys:
+
+           s/iter  none outer outer_init inner
+none         1.55    --   -1%        -2%   -5%
+outer        1.53    1%    --        -0%   -4%
+outer_init   1.52    2%    0%         --   -4%
+inner        1.47    6%    4%         4%    --
diff --git a/bin/experiments/bench_hash_slice_vs_for_and_last.pl b/bin/experiments/bench_hash_slice_vs_for_and_last.pl
new file mode 100644
index 000000000..68eec0a17
--- /dev/null
+++ b/bin/experiments/bench_hash_slice_vs_for_and_last.pl
@@ -0,0 +1,124 @@
+# Benchmark two approaches which could be used to get a total tree path length
+# It is actually to do with a hash slice vs for-last approach
+use 5.016;
+
+use Benchmark qw {:all};
+use List::Util qw /pairs pairkeys pairvalues pairmap/;
+use Test::More;
+
+#srand (2000);
+
+my $n = 200;      # depth of the paths
+my $m = 80;       # number of paths
+my %path_arrays;  # ordered key-value pairs
+my %path_hashes;  # unordered key-value pairs
+my %len_hash;
+
+# generate a set of paths
+foreach my $i (0 .. $m) {
+    my $same_to = int (rand() * $n/4);
+    my @a;
+    @a = map {(1+$m)*$i*$n+$_.'_', 1} (0 .. $same_to);
+    push @a, map {$_, 1} ($same_to+1 .. $n);
+    $path_arrays{$i} = \@a;
+    #say join ' ', @a;
+    my %hash = @a;
+    $path_hashes{$i} = \%hash;
+
+    @len_hash{keys %hash} = values %hash;
+}
+
+
+my $sliced = slice (\%path_hashes);
+my $forled = for_last (\%path_arrays);
+my $slice2 = slice_mk2 (\%path_hashes);
+
+is_deeply ($forled, $sliced, 'slice results are the same');
+is_deeply ($forled, $slice2, 'slice2 results are the same');
+
+
+done_testing;
+
+say "Testing $m paths of depth $n";
+cmpthese (
+    -2,
+    {
+        sliced => sub {slice (\%path_hashes)},
+        slice2 => sub {slice_mk2 (\%path_hashes)},
+        forled => sub {for_last (\%path_arrays)},
+    }
+);
+
+
+
+sub slice {
+    my $paths = shift;
+
+    my %combined;
+
+    foreach my $path (values %$paths) {
+        @combined{keys %$path} = values %$path;
+    }
+
+    return \%combined;
+}
+
+# assign values at end
+sub slice_mk2 {
+    my $paths = shift;
+
+    my %combined;
+
+    foreach my $path (values %$paths) {
+        @combined{keys %$path} = undef;
+    }
+
+    @combined{keys %combined} = @len_hash{keys %combined};
+
+    return \%combined;
+}
+
+sub for_last {
+    my $paths = shift;
+
+
+    my @keys  = keys %$paths;
+    my $first = shift @keys;
+    my $first_list = $paths->{$first};
+
+    # initialise
+    my %combined;
+    @combined{pairkeys @$first_list} = pairvalues @$first_list;
+
+    foreach my $list (values %$paths) {
+        foreach my $pair (pairs @$list) {
+            my ($key, $val) = @$pair;
+            last if exists $combined{$key};
+            $combined{$key} = $val;
+        }
+    }
+
+    return \%combined;
+}
+
+1;
+
+__END__
+
+Sample results below.
+Some runs have no meaningful difference from sliced to slice2,
+but slice2 is always faster (even if only 2%).
+Normally it is ~15% faster.
+
+
+Testing 800 paths of depth 20
+         Rate forled sliced slice2
+forled 59.2/s     --   -69%   -73%
+sliced  192/s   225%     --   -13%
+slice2  222/s   275%    15%     --
+
+Testing 800 paths of depth 20
+         Rate forled sliced slice2
+forled 49.5/s     --   -74%   -77%
+sliced  194/s   292%     --   -12%
+slice2  219/s   342%    13%     --
diff --git a/bin/experiments/bench_or_vs_plus.pl b/bin/experiments/bench_or_vs_plus.pl
new file mode 100644
index 000000000..6558dfb68
--- /dev/null
+++ b/bin/experiments/bench_or_vs_plus.pl
@@ -0,0 +1,52 @@
+use 5.010;
+use Test::More;
+use Benchmark qw {:all};
+#use List::Util qw {:all};
+
+my $a = 1;
+my $b = rand();
+my $c = rand();
+
+for $a (0, rand(), 1, 2000) {
+    my $result_or       = !!use_or();
+    my $result_or2      = !!use_or2();
+    my $result_plus     = !!use_plus();
+    my $result_factored = !!factored();
+
+    is ($result_or, $result_or2,      'use_or2()');
+    is ($result_or, $result_plus,     'use_plus()');
+    is ($result_or, $result_factored, 'factored()');
+
+    say "$a: $result_or, $result_or2, $result_plus, $result_factored";
+
+    cmpthese (
+        5000000,
+        {
+            or       => sub {use_or ()},
+            or2      => sub {use_or2 ()},
+            plus     => sub {use_plus ()},
+            factored => sub {factored ()},
+        }
+    );
+
+}
+
+done_testing();
+
+
+sub use_or {
+    my $x = $a || $b and $a || $c;
+}
+
+sub use_or2 {
+    my $x = ($a || $b) && ($a || $c);
+}
+
+sub factored {
+    my $x = $a || ($b && $c);
+}
+
+sub use_plus {
+    my $x = $a + $b and $a + $c;
+}
+
diff --git a/bin/experiments/bench_reduce_v_for.pl b/bin/experiments/bench_reduce_v_for.pl
new file mode 100644
index 000000000..11e1a46ee
--- /dev/null
+++ b/bin/experiments/bench_reduce_v_for.pl
@@ -0,0 +1,48 @@
+use 5.010;
+use Benchmark qw {:all};
+use List::Util qw /reduce/;
+use Data::Dumper;
+
+my $n = 1000;
+my $min = -500;
+my @a1 = ($min .. ($min + $n));
+
+my @data;
+for my $i (0 .. $#a1) {
+    push @data, [$a1[$i], $a1[-($i+1)]];
+}
+
+say Dumper sub1(\@data);
+say Dumper sub2(\@data);
+
+
+
+cmpthese (
+    -5,
+    {
+        reduce  => sub {sub2 ()},
+        foreach => sub {sub1 ()},
+    }
+);
+
+
+sub sub2 {
+    my $pairs = shift;
+
+    my $first = reduce { ($a->[0] cmp $b->[0] || $a->[1] cmp $b->[1]) < 0 ? $b : $a} @$pairs;
+
+    return $first;
+}
+
+sub sub1 {
+    my $pairs = shift;
+
+    my $first = $pairs->[0];
+    foreach my $pair (@$pairs) {
+        if (($first->[0] cmp $pair->[0] || $first->[1] cmp $pair->[1]) < 0) {
+            $first = $pair;
+        }
+    }
+
+    return $first;
+}
diff --git a/bin/experiments/bench_ref_reftype.pl b/bin/experiments/bench_ref_reftype.pl
new file mode 100644
index 000000000..4251c6a67
--- /dev/null
+++ b/bin/experiments/bench_ref_reftype.pl
@@ -0,0 +1,31 @@
+
+use Benchmark qw {:all};
+use 5.016;
+use Data::Dumper;
+use Scalar::Util qw /reftype/;
+
+#use List::Util qw {:all};
+
+my @vals = 1 .. 20;
+my $self = {PARAMS => {label_hash1 => 5, @vals}};
+
+#my $n = 1000;
+#my @a1 = (0 .. $n);
+
+my $param = 'label_hash1';
+
+$| = 1;
+
+#$self = [];
+
+say (((ref $self) =~ /HASH/) ? 1 : 0);
+say (reftype ($self) eq 'HASH' ? 1 : 0);
+
+
+cmpthese (
+    -3,
+    {
+        old1 => sub {(ref $self) =~ /HASH/},
+        new1 => sub {reftype ($self) eq 'HASH'},
+    }
+);
diff --git a/bin/experiments/bench_wantarray.pl b/bin/experiments/bench_wantarray.pl
new file mode 100644
index 000000000..bcc7f785e
--- /dev/null
+++ b/bin/experiments/bench_wantarray.pl
@@ -0,0 +1,50 @@
+
+use Benchmark qw {:all};
+use 5.016;
+use Data::Dumper;
+use Scalar::Util qw /reftype/;
+
+#use List::Util qw {:all};
+
+my @vals = 1 .. 200;
+my $self = {PARAMS => {label_hash1 => 5, @vals}};
+
+#my $n = 1000;
+#my @a1 = (0 ..
$n);
+
+my $param = 'label_hash1';
+
+$| = 1;
+
+
+cmpthese (
+    -3,
+    {
+        wa => sub {scalar use_wantarray()},
+        hr => sub {scalar return_hashref()},
+        #h  => sub {scalar return_hash()},
+        b  => sub {scalar bare()},
+        bh => sub {scalar bare_hashref()},
+    }
+);
+
+
+sub use_wantarray {
+    return wantarray ? %$self : $self;
+}
+
+sub return_hashref {
+    return $self;
+}
+
+sub return_hash {
+    return %$self;
+}
+
+sub bare {
+    wantarray ? %$self : $self;
+}
+
+sub bare_hashref {
+    $self;
+}
\ No newline at end of file
diff --git a/bin/experiments/export_geotiff.pl b/bin/experiments/export_geotiff.pl
new file mode 100644
index 000000000..5e1f1af2e
--- /dev/null
+++ b/bin/experiments/export_geotiff.pl
@@ -0,0 +1,76 @@
+
+use 5.016;
+
+use FindBin;
+use Path::Class;
+use rlib "../../lib";
+
+use Biodiverse::Config;
+
+use Geo::GDAL;
+use Biodiverse::BaseData;
+
+
+
+my $data_dir = Path::Class::Dir->new($FindBin::Bin, '..', '..', 'data')->stringify;
+
+my $bd_file = Path::Class::File->new($data_dir, 'example_data_x64.bds')->stringify;
+
+my $bd = Biodiverse::BaseData->new (file => $bd_file);
+
+my $sp = $bd->add_spatial_output(name => 'blah');
+$sp->run_analysis (
+    calculations       => ['calc_richness'],
+    spatial_conditions => ['sp_self_only()'],
+);
+
+
+#my $export_type = 'ArcInfo floatgrid files';
+my $export_type = 'GeoTIFF';
+my $filename    = 'xx.tiff';
+my $list_name   = 'SPATIAL_RESULTS';
+
+$sp->export (
+    format => $export_type,
+    file   => $filename,
+    list   => $list_name,
+);
+
+
+
+
+#my $format = "GTiff";
+#my $driver = Geo::GDAL::GetDriverByName( $format );
+#
+#my $metadata = $driver->GetMetadata();
+#
+#if (exists $metadata->{DCAP_CREATE} && $metadata->{DCAP_CREATE} eq 'YES') {
+#    say "Driver for $metadata->{DMD_LONGNAME} supports Create() method.";
+#}
+#
+#if (exists $metadata->{DCAP_CREATECOPY} && $metadata->{DCAP_CREATECOPY} eq 'YES') {
+#    say "Driver $metadata->{DMD_LONGNAME} supports CreateCopy() method.";
+#}
+#
+#my $n = 100;
+#my @data;
+#my $pdata;
+#for my $i
(reverse (0 .. $n)) { +# for my $j (0 .. $n) { +# my $value = $i * $n + $j; +# $data[$i][$j] = $value; +# $pdata .= pack ('f', $value); +# } +#} +# +# +#my $f_name = 'test5.tif'; +#my ($cols, $rows) = ($n+1, $n+1); +# +#my $out_raster = $driver->Create($f_name, $cols, $rows, 1, 'Float32'); +# +#my $outband = $out_raster->GetRasterBand(1); +#$outband->WriteRaster(0, 0, $rows, $cols, $pdata); +# +# +# diff --git a/lib/Biodiverse/BaseData.pm b/lib/Biodiverse/BaseData.pm index 609fc5741..fc3d4785b 100644 --- a/lib/Biodiverse/BaseData.pm +++ b/lib/Biodiverse/BaseData.pm @@ -3186,7 +3186,7 @@ sub delete_groups { } foreach my $element (@$elements) { - $self->delete_element (type => 'GROUP', element => $element); + $self->delete_element (type => 'GROUPS', element => $element); } return; diff --git a/lib/Biodiverse/Indices.pm b/lib/Biodiverse/Indices.pm index 0a1c19297..f265e58a4 100644 --- a/lib/Biodiverse/Indices.pm +++ b/lib/Biodiverse/Indices.pm @@ -37,7 +37,8 @@ use parent qw { Biodiverse::Indices::LabelPropertiesRangeWtd Biodiverse::Indices::GroupProperties Biodiverse::Indices::LabelCounts - Biodiverse::Indices::RWTurnover + Biodiverse::Indices::RWTurnover + Biodiverse::Indices::RichnessEstimation Biodiverse::Common }; diff --git a/lib/Biodiverse/Indices/Indices.pm b/lib/Biodiverse/Indices/Indices.pm index 9247983a8..cf3c9ff86 100644 --- a/lib/Biodiverse/Indices/Indices.pm +++ b/lib/Biodiverse/Indices/Indices.pm @@ -1455,7 +1455,7 @@ sub get_metadata_calc_elements_used { }, EL_COUNT_ALL => { description => 'Count of elements in both neighbour sets', - uses_nbr_lists => 2, + uses_nbr_lists => 1, lumper => 1, }, }, @@ -1481,6 +1481,66 @@ sub calc_elements_used { } +sub get_metadata_calc_nonempty_elements_used { + my $self = shift; + + my %metadata = ( + name => 'Non-empty element counts', + description => "Counts of non-empty elements in neighbour sets 1 and 2.\n", + type => 'Lists and Counts', + pre_calc => 'calc_abc', + uses_nbr_lists => 1, # how many sets of lists it 
must have + indices => { + EL_COUNT_NONEMPTY_SET1 => { + description => 'Count of non-empty elements in neighbour set 1', + lumper => 0, + }, + EL_COUNT_NONEMPTY_SET2 => { + description => 'Count of non-empty elements in neighbour set 2', + uses_nbr_lists => 2, + lumper => 0, + }, + EL_COUNT_NONEMPTY_ALL => { + description => 'Count of non-empty elements in both neighbour sets', + uses_nbr_lists => 1, + lumper => 1, + }, + }, + ); + + return $metadata_class->new(\%metadata); +} + +sub calc_nonempty_elements_used { + my $self = shift; + my %args = @_; # rest of args into a hash + + # should run a precalc_gobal to check if the + # basedata has empty groups as then we can shortcut + my $bd = $self->get_basedata_ref; + my $list = $args{element_list_all}; + + my %nonempty; + foreach my $gp (@$list) { + my $ref = $bd->get_labels_in_group_as_hash (group => $gp); + next if !scalar keys %$ref; + $nonempty{$gp}++; + } + my $non_empty_all = scalar keys %nonempty; + my $non_empty_set1 = grep {exists $nonempty{$_}} keys %{$args{element_list1} // {}}; + my $non_empty_set2 = $args{element_list2} + ? grep {exists $nonempty{$_}} keys %{$args{element_list2}} + : undef; + + my %results = ( + EL_COUNT_NONEMPTY_SET1 => $non_empty_set1, + EL_COUNT_NONEMPTY_SET2 => $non_empty_set2, + EL_COUNT_NONEMPTY_ALL => $non_empty_all, + ); + + return wantarray ? 
%results : \%results; +} + sub get_metadata_calc_element_lists_used { my $self = shift; diff --git a/lib/Biodiverse/Indices/PhylogeneticRelative.pm b/lib/Biodiverse/Indices/PhylogeneticRelative.pm index c630d35ca..8f336e4df 100644 --- a/lib/Biodiverse/Indices/PhylogeneticRelative.pm +++ b/lib/Biodiverse/Indices/PhylogeneticRelative.pm @@ -7,6 +7,8 @@ use Data::Alias qw /alias/; use Carp; +our $VERSION = '1.99_001'; + my $metadata_class = 'Biodiverse::Metadata::Indices'; sub get_metadata_calc_phylo_rpd1 { diff --git a/lib/Biodiverse/Indices/RichnessEstimation.pm b/lib/Biodiverse/Indices/RichnessEstimation.pm new file mode 100644 index 000000000..a6ba5d9e5 --- /dev/null +++ b/lib/Biodiverse/Indices/RichnessEstimation.pm @@ -0,0 +1,1013 @@ +package Biodiverse::Indices::RichnessEstimation; + +use 5.016; + +use strict; +use warnings; +use Carp; + +use List::Util qw /max min sum/; + +our $VERSION = '1.99_001'; + +my $metadata_class = 'Biodiverse::Metadata::Indices'; + +use Readonly; + +Readonly my $z_for_ci => 1.959964; # currently hard coded for 0.95 + +sub get_metadata_calc_chao1 { + my %metadata = ( + description => 'Chao1 species richness estimator (abundance based)', + name => 'Chao1', + type => 'Richness estimators', + pre_calc => 'calc_abc3', + uses_nbr_lists => 1, # how many lists it must have + indices => { + CHAO1_ESTIMATE => { + description => 'Chao1 index', + reference => 'NEEDED', + formula => [], + }, + CHAO1_F1_COUNT => { + description => 'Number of singletons in the sample', + }, + CHAO1_F2_COUNT => { + description => 'Number of doubletons in the sample', + }, + CHAO1_SE => { + description => 'Standard error of the Chao1 estimator [= sqrt(variance)]', + }, + CHAO1_VARIANCE => { + description => 'Variance of the Chao1 estimator', + }, + CHAO1_UNDETECTED => { + description => 'Estimated number of undetected species', + }, + CHAO1_CI_LOWER => { + description => 'Lower confidence interval for the Chao1 estimate', + }, + CHAO1_CI_UPPER => { + description => 
'Upper confidence interval for the Chao1 estimate', + }, + CHAO1_META => { + description => 'Metadata indicating which formulae were used in the ' + . 'calculations. Numbers refer to EstimateS equations at ' + . 'http://viceroy.eeb.uconn.edu/EstimateS/EstimateSPages/EstSUsersGuide/EstimateSUsersGuide.htm', + type => 'list', + }, + }, + ); + + return $metadata_class->new (\%metadata); +} + +sub calc_chao1 { + my $self = shift; + my %args = @_; + + my $label_hash = $args{label_hash_all}; + + my ($f1, $f2, $n) = (0, 0, 0); # singletons and doubletons + + foreach my $abundance (values %$label_hash) { + if ($abundance == 1) { + $f1 ++; + } + elsif ($abundance == 2) { + $f2 ++; + } + $n += $abundance; + } + + my $richness = scalar keys %$label_hash; + # correction factors + my $cn1 = $n ? ($n - 1) / $n : 1; # avoid divide by zero issues with empty sets + my $cn2 = $cn1 ** 2; + + my $chao_formula = 2; + my $chao_partial = 0; + my $variance; + # flags to use variance formaulae from EstimateS website + my $variance_uses_eq8 = !$f1; # no singletons + my $variance_uses_eq7; + + # if $f1 == $f2 == 0 then the partial is zero. 
+ if ($f1) { + if ($f2) { # one or more doubletons + $chao_partial = $f1 ** 2 / (2 * $f2); + my $f12_ratio = $f1 / $f2; + $variance = $f2 * ( ($cn1 * $f12_ratio ** 2) / 2 + + $cn2 * $f12_ratio ** 3 + + ($cn2 * $f12_ratio ** 4) / 4); + } + elsif ($f1 > 1) { # no doubletons, but singletons + $chao_partial = $f1 * ($f1 - 1) / 2; + $variance_uses_eq7 = 1; # need the chao score to estimate this variance + } + else { + # if only one singleton and no doubletons then the estimate stays zero + $variance_uses_eq8 = 1; + $chao_formula = 0; + } + } + + $chao_partial *= $cn1; + + my $chao = $richness + $chao_partial; + + if ($variance_uses_eq7) { + $variance = $cn1 * ($f1 * ($f1 - 1)) / 2 + + $cn2 * $f1 * (2 * $f1 - 1)**2 / 4 + - $cn2 * $f1 ** 4 / 4 / $chao; + #$chao_formula = 0; + } + elsif ($variance_uses_eq8) { + my %sums; + foreach my $freq (values %$label_hash) { + $sums{$freq} ++; + } + my ($part1, $part2); + foreach my $i (keys %sums) { + my $f = $sums{$i}; + #say "$i $f"; + $part1 += $f * (exp (-$i) - exp (-2 * $i)); + $part2 += $i * exp (-$i) * $f; + } + $variance = $part1 - $part2 ** 2 / $n; + $chao_formula = 0; + } + + $variance = max (0, $variance); + + # and now the confidence interval + my $ci_scores = $self->_calc_chao_confidence_intervals ( + F1 => $f1, + F2 => $f2, + chao_score => $chao, + richness => $richness, + variance => $variance, + label_hash => $label_hash, + ); + + my $chao_meta = { + VARIANCE_FORMULA => $variance_uses_eq7 ? 7 : + $variance_uses_eq8 ? 8 : 6, + CHAO_FORMULA => $chao_formula, + CI_FORMULA => $variance_uses_eq8 ? 14 : 13, + }; + + my %results = ( + CHAO1_ESTIMATE => $chao, + CHAO1_F1_COUNT => $f1, + CHAO1_F2_COUNT => $f2, + CHAO1_SE => sqrt ($variance), + CHAO1_VARIANCE => $variance, + CHAO1_UNDETECTED => $chao_partial, + CHAO1_CI_LOWER => $ci_scores->{ci_lower}, + CHAO1_CI_UPPER => $ci_scores->{ci_upper}, + CHAO1_META => $chao_meta, + ); + + return wantarray ? 
%results : \%results; +} + + +sub get_metadata_calc_chao2 { + my %metadata = ( + description => 'Chao2 species richness estimator (incidence based)', + name => 'Chao2', + type => 'Richness estimators', + pre_calc => 'calc_abc2', + uses_nbr_lists => 1, # how many lists it must have + indices => { + CHAO2_ESTIMATE => { + description => 'Chao2 index', + reference => 'NEEDED', + formula => [], + }, + CHAO2_Q1_COUNT => { + description => 'Number of uniques in the sample', + }, + CHAO2_Q2_COUNT => { + description => 'Number of duplicates in the sample', + }, + CHAO2_VARIANCE => { + description => 'Variance of the Chao2 estimator', + }, + CHAO2_SE => { + description => 'Standard error of the Chao2 estimator [= sqrt (variance)]', + }, + CHAO2_CI_LOWER => { + description => 'Lower confidence interval for the Chao2 estimate', + }, + CHAO2_CI_UPPER => { + description => 'Upper confidence interval for the Chao2 estimate', + }, + CHAO2_UNDETECTED => { + description => 'Estimated number of undetected species', + }, + CHAO2_META => { + description => 'Metadata indicating which formulae were used in the ' + . 'calculations. Numbers refer to EstimateS equations at ' + . 'http://viceroy.eeb.uconn.edu/EstimateS/EstimateSPages/EstSUsersGuide/EstimateSUsersGuide.htm', + type => 'list', + }, + }, + ); + + return $metadata_class->new(\%metadata); +} + +# very similar to chao1 - could refactor common code +sub calc_chao2 { + my $self = shift; + my %args = @_; + + my $label_hash = $args{label_hash_all}; + my $R = $args{element_count_all}; + + my ($Q1, $Q2) = (0, 0); # singletons and doubletons + + foreach my $freq (values %$label_hash) { + if ($freq == 1) { + $Q1 ++; + } + elsif ($freq == 2) { + $Q2 ++; + } + } + + my $richness = scalar keys %$label_hash; + my $c1 = $R ? 
($R - 1) / $R : 1; + my $c2 = $c1 ** 2; + + my $chao_partial = 0; + my $variance; + + # flags to use variance formaulae from EstimateS website + my $variance_uses_eq12 = !$Q1; # no uniques + my $variance_uses_eq11; + + my $chao_formula = 4; # eq 2 from EstimateS + + # if $f1 == $f2 == 0 then the partial is zero. + if ($Q1) { + if ($Q2) { # one or more doubletons + $chao_partial = $Q1 ** 2 / (2 * $Q2); + my $Q12_ratio = $Q1 / $Q2; + $variance = $Q2 * ( $Q12_ratio ** 2 * $c1 / 2 + + $Q12_ratio ** 3 * $c2 + + $Q12_ratio ** 4 * $c2 / 4); + } + elsif ($Q1 > 1) { # no doubletons, but singletons + $chao_partial = $Q1 * ($Q1 - 1) / 2; + $variance_uses_eq11 = 1; + } + elsif ($Q1) { + $variance_uses_eq12 = 1; + $chao_formula = 0; + } + # if only one singleton and no doubletons then chao stays zero + } + + $chao_partial *= $c1; + + my $chao = $richness + $chao_partial; + + if ($variance_uses_eq11) { + $variance = $c1 * ($Q1 * ($Q1 - 1)) / 2 + + $c2 * ($Q1 * (2 * $Q1 - 1) ** 2) / 4 + - $c2 * $Q1 ** 4 / 4 / $chao; + } + elsif ($variance_uses_eq12) { # same structure as eq8 - could refactor + my %sums; + foreach my $freq (values %$label_hash) { + $sums{$freq} ++; + } + my ($part1, $part2); + while (my ($i, $Q) = each %sums) { + $part1 += $Q * (exp (-$i) - exp (-2 * $i)); + $part2 += $i * exp (-$i) * $Q; + } + $variance = $part1 - $part2 ** 2 / $R; + $chao_formula = 0; + } + + $variance = max (0, $variance); + + # and now the confidence interval + my $ci_scores = $self->_calc_chao_confidence_intervals ( + F1 => $Q1, + F2 => $Q2, + chao_score => $chao, + richness => $richness, + variance => $variance, + label_hash => $label_hash, + ); + + my $chao_meta = { + VARIANCE_FORMULA => $variance_uses_eq11 ? 11 : + $variance_uses_eq12 ? 12 : 10, + CHAO_FORMULA => $chao_formula, + CI_FORMULA => $variance_uses_eq12 ? 
14 : 13, + }; + + my %results = ( + CHAO2_ESTIMATE => $chao, + CHAO2_Q1_COUNT => $Q1, + CHAO2_Q2_COUNT => $Q2, + CHAO2_VARIANCE => $variance, + CHAO2_SE => sqrt ($variance), + CHAO2_UNDETECTED => $chao_partial, + CHAO2_CI_LOWER => $ci_scores->{ci_lower}, + CHAO2_CI_UPPER => $ci_scores->{ci_upper}, + CHAO2_META => $chao_meta, + ); + + return wantarray ? %results : \%results; +} + + +sub _calc_chao_confidence_intervals { + my $self = shift; + my %args = @_; + + my $f1 = $args{F1}; + my $f2 = $args{F2}; + my $chao = $args{chao_score}; + my $richness = $args{richness}; + my $variance = $args{variance}; + my $label_hash = $args{label_hash}; + + # and now the confidence interval + my ($lower, $upper); + if (($f1 && $f2) || ($f1 > 1)) { + my $T = $chao - $richness; + my $K; + eval { + no warnings qw /numeric uninitialized/; + $K = exp ($z_for_ci * sqrt (log (1 + $variance / $T ** 2))); + $lower = $richness + $T / $K; + $upper = $richness + $T * $K; + }; + } + else { + my $P = 0; + my %sums; + foreach my $freq (values %$label_hash) { + $sums{$freq} ++; + } + # set CIs to undefined if we only have singletons/uniques + if (! (scalar keys %sums == 1 && exists $sums{1})) { + while (my ($f, $count) = each %sums) { + $P += $count * exp (-$f); + } + $P /= $richness; + my $part1 = $richness / (1 - $P); + my $part2 = $z_for_ci * sqrt ($variance) / (1 - $P); + $lower = max ($richness, $part1 - $part2); + $upper = $part1 + $part2; + } + } + + my %results = ( + ci_lower => $lower, + ci_upper => $upper, + ); + + return wantarray ? 
%results : \%results; +} + +sub get_metadata_calc_ace { + my %metadata = ( + description => 'Abundance Coverage-based Estimator os species richness', + name => 'ACE', + type => 'Richness estimators', + pre_calc => 'calc_abc3', + uses_nbr_lists => 1, # how many lists it must have + reference => 'needed', + indices => { + ACE_ESTIMATE => { + description => 'ACE score', + }, + ACE_SE => { + description => 'ACE standard error', + }, + ACE_VARIANCE => { + description => 'ACE variance', + }, + ACE_CI_UPPER => { + description => 'ACE upper confidence interval estimate', + }, + ACE_CI_LOWER => { + description => 'ACE lower confidence interval estimate', + }, + ACE_UNDETECTED => { + description => 'Estimated number of undetected species', + }, + ACE_INFREQUENT_COUNT => { + description => 'Count of infrequent species', + }, + }, + ); + + return $metadata_class->new (\%metadata); +} + + +my %ace_ice_remap = ( + ACE_ESTIMATE => 'ICE_ESTIMATE', + ACE_SE => 'ICE_SE', + ACE_VARIANCE => 'ICE_VARIANCE', + ACE_CI_UPPER => 'ICE_CI_UPPER', + ACE_CI_LOWER => 'ICE_CI_LOWER', + ACE_UNDETECTED => 'ICE_UNDETECTED', + ACE_INFREQUENT_COUNT => 'ICE_INFREQUENT_COUNT', +); + +sub calc_ace { + my $self = shift; + my %args = @_; + + my $label_hash = $args{label_hash_all}; + + # Only the gamma differs between the two + # (apart from the inputs) + my $calc_ice = $args{calc_ice}; + my $t = $args{EL_COUNT_NONEMPTY_ALL}; + + my %f_rare; + my $S_abundants = 0; + my $S_rare = 0; + my $f1 = 0; + my $n_rare = 0; + my $richness = scalar keys %$label_hash; + my %all_freqs; # all taxa + + foreach my $freq (values %$label_hash) { + $all_freqs{$freq}++; + if ($freq <= 10) { + $f_rare{$freq} ++; + $n_rare += $freq; + $S_rare ++; + if ($freq == 1) { + $f1 ++; + } + } + else { + $S_abundants ++; + } + } + + # single sample loc for ICE + # or ACE and all samples are singletons + # or no labels + if (!$richness || ($calc_ice && $t <= 1) || (($f_rare{1} // 0) == $richness)) { + my %results; + @results{keys 
%ace_ice_remap} = undef; + $results{ACE_INFREQUENT_COUNT} = $S_rare; + $results{ACE_ESTIMATE} = $richness; + return wantarray ? %results : \%results; + } + + # if no rares or no singletons then use Chao1 or Chao2 as they handle such cases + if (!$n_rare || !$f_rare{1}) { + my %results = ( + ACE_INFREQUENT_COUNT => $S_rare, + ); + my $tmp_results; + my $pfx = 'ACE'; + if ($calc_ice) { + $tmp_results = $self->calc_chao2(%args); + $pfx = 'ICE'; + } + else { + $tmp_results = $self->calc_chao1(%args); + } + foreach my $key (keys %$tmp_results) { + next if $key =~ /(?:META|[QF][12]_COUNT)$/; + my $new_key = $key; + $new_key =~ s/^CHAO\d/$pfx/; + $results{$new_key} = $tmp_results->{$key}; + } + return wantarray ? %results : \%results; + } + + my $C_ace = 1 - $f1 / $n_rare; + + my ($a1, $a2); # $a names from SpadeR + for my $i (1 .. 10) { + no autovivification; + next if !$f_rare{$i}; + $a1 += $i * ($i-1) * $f_rare{$i}; + $a2 += $i * $f_rare{$i}; + } + + my $gamma_rare_hat_square; + if ($calc_ice) { + # Correction factor for C_ace from + # https://github.com/AnneChao/SpadeR::SpecInciModelh.R + # Seems undocumented as it is not in the cited refs, + # but we'll use it under the assumption that Chao's code is canonical. + my $A = 1; + if ($f_rare{1}) { + if ($f_rare{2}) { + $A = 2 * $f_rare{2} / (($t-1) * $f_rare{1} + 2 * $f_rare{2}); + } + else { + $A = 2 / (($t-1) * ($f_rare{1} - 1) + 2); + } + } + $C_ace = 1 - ($f1 / $n_rare) * (1 - $A); + $gamma_rare_hat_square = $t == 1 + ? 0 # avoid divide by zero + : ($S_rare / $C_ace) + * $t / ($t - 1) + * $a1 / $a2 / ($a2 - 1) + - 1; + } + else { + $gamma_rare_hat_square + = ($S_rare / $C_ace) + * $a1 / $a2 / ($a2 - 1) + - 1; + } + $gamma_rare_hat_square = max ($gamma_rare_hat_square, 0); + + my $S_ace = $S_abundants + + $S_rare / $C_ace + + $f1 / $C_ace * $gamma_rare_hat_square; + + my $cv = sqrt $gamma_rare_hat_square; + + my $variance_method = $calc_ice ? 
'_get_ice_variance' : '_get_ace_variance'; + + my $variance = $self->$variance_method ( + freq_counts => \%all_freqs, + f_rare => \%f_rare, + cv => $cv, + n_rare => $n_rare, + C_rare => $C_ace, + S_rare => $S_rare, + s_estimate => $S_ace, + t => $t, + ); + my $se = sqrt $variance; + + my $ci_vals = $self->_calc_ace_confidence_intervals ( + freq_counts => \%all_freqs, + variance => $variance, + s_estimate => $S_ace, + richness => $richness, + label_hash => $label_hash, + ); + + my %results = ( + ACE_ESTIMATE => $S_ace, + ACE_SE => $se, + ACE_UNDETECTED => $S_ace - $richness, + ACE_VARIANCE => $variance, + ACE_CI_LOWER => $ci_vals->{ci_lower}, + ACE_CI_UPPER => $ci_vals->{ci_upper}, + ACE_INFREQUENT_COUNT => $S_rare, + ); + + return wantarray ? %results : \%results; +} + + +sub get_metadata_calc_ice { + my %metadata = ( + description => 'Incidence Coverage-based Estimator of species richness', + name => 'ICE', + type => 'Richness estimators', + pre_calc => [qw /calc_abc2 calc_nonempty_elements_used/], + uses_nbr_lists => 1, # how many lists it must have + indices => { + ICE_ESTIMATE => { + description => 'ICE score', + }, + ICE_SE => { + description => 'ICE standard error', + }, + ICE_VARIANCE => { + description => 'ICE variance', + }, + ICE_CI_UPPER => { + description => 'ICE upper confidence interval estimate', + }, + ICE_CI_LOWER => { + description => 'ICE lower confidence interval estimate', + }, + ICE_UNDETECTED => { + description => 'Estimated number of undetected species', + }, + ICE_INFREQUENT_COUNT => { + description => 'Count of infrequent species', + }, + }, + ); + + return $metadata_class->new (\%metadata); +} + +sub calc_ice { + my $self = shift; + my %args = @_; + + my $tmp_results = $self->calc_ace (%args, calc_ice => 1); + my %results; + foreach my $ace_key (keys %$tmp_results) { + my $ice_key = $ace_key; + $ice_key =~ s/^ACE/ICE/; + $results{$ice_key} = $tmp_results->{$ace_key}; + }; + + return wantarray ? 
%results : \%results; +} + +# almost identical to _get_ice_variance but integrating the two +# would prob result in more complex code +sub _get_ace_variance { + my $self = shift; + my %args = @_; + + my $freq_counts = $args{freq_counts}; + + # precalculate the differentials and covariances + my (%diff, %cov); + foreach my $i (sort {$a <=> $b} keys %$freq_counts) { + $diff{$i} = $self->_get_ace_differential (%args, f => $i); + foreach my $j (sort {$a <=> $b} keys %$freq_counts) { + $cov{$i}{$j} = $cov{$j}{$i} + // $self->_get_ace_ice_cov (%args, i => $i, j => $j); + } + } + + my $var_ace = 0; + foreach my $i (keys %$freq_counts) { + foreach my $j (keys %$freq_counts) { + my $partial + = $diff{$i} * $diff{$j} * $cov{$i}{$j}; + $var_ace += $partial; + } + } + + $var_ace ||= undef; + + return $var_ace; +} + +sub _get_ice_variance { + my $self = shift; + my %args = @_; + + my $freq_counts = $args{freq_counts}; + + # precalculate the differentials and covariances + my (%diff, %cov); + foreach my $i (sort {$a <=> $b} keys %$freq_counts) { + $diff{$i} = $self->_get_ice_differential (%args, f => $i); + foreach my $j (sort {$a <=> $b} keys %$freq_counts) { + $cov{$i}{$j} = $cov{$j}{$i} + // $self->_get_ace_ice_cov (%args, i => $i, j => $j); + } + } + + my $var_ice = 0; + foreach my $i (keys %$freq_counts) { + foreach my $j (keys %$freq_counts) { + my $partial + = $diff{$i} * $diff{$j} * $cov{$i}{$j}; + $var_ice += $partial; + } + } + + $var_ice ||= undef; + + return $var_ice; +} + +# common to ACE and ICE +sub _get_ace_ice_cov { + my $self = shift; + my %args = @_; + my ($i, $j, $s_ice) = @args{qw/i j s_estimate/}; + my $Q = $args{freq_counts}; + + my $covf; + if ($i == $j) { + $covf = $Q->{$i} * (1 - $Q->{$i} / $s_ice); + } + else { + $covf = -1 * $Q->{$i} * $Q->{$j} / $s_ice; + } + + return $covf; +} + + +sub _get_ice_differential { + my $self = shift; + my %args = @_; + + no autovivification; + + my $k = 10; # later we will make this an argument + + my $q = $args{q} // 
$args{f}; + + return 1 if $q > $k; + + my $CV_infreq_h = $args{cv}; + my $freq_counts = $args{freq_counts}; + my $n_infreq = $args{n_rare}; + my $C_infreq = $args{C_rare}; # get from gamma calcs + my $D_infreq = $args{S_rare}; # richness of labels with sample counts < $k + my $Q = $args{f_rare}; + my $t = $args{t}; + + my @u = (1..$k); + + $n_infreq //= + sum + map {$_ * $freq_counts->{$_}} + grep {$_ < $k} + keys %$freq_counts; + + my $si = sum map {$_ * ($_-1) * ($Q->{$_} // 0)} @u; + + my ($Q1, $Q2) = @$Q{1,2}; + $Q1 //= 0; + $Q2 //= 0; + + my ($d, $dc_infreq); + + if ($CV_infreq_h != 0) { + if ($q == 1) { + $dc_infreq = + -1 * ( + $n_infreq * (($t - 1) * $Q1 + 2 * $Q2) * 2 * $Q1 * ($t - 1) + - ($t - 1) * $Q1**2 * (($t - 1) * ($Q1 + $n_infreq) + 2 * $Q2) + ) + / ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2)) ** 2; + + $d = ($C_infreq - $D_infreq * $dc_infreq) / $C_infreq ** 2 + + $t / ($t - 1) + * ($C_infreq**2*$n_infreq*($n_infreq - 1) + * ($D_infreq * $si + $Q1 * $si) + - $Q1 * $D_infreq * $si * + (2 * $C_infreq * $dc_infreq + * $n_infreq * ($n_infreq - 1) + + $C_infreq ** 2 + * ($n_infreq - 1) + + $C_infreq ** 2 + * $n_infreq + ) + ) / $C_infreq ** 4 / $n_infreq ** 2 / ($n_infreq - 1) ** 2 + - ($C_infreq - $Q1 * $dc_infreq) / $C_infreq**2; + } + elsif ($q == 2){ + $dc_infreq + = -( -($t - 1) * $Q1**2 * + (2 * ($t - 1) * $Q1 + 2 * ($n_infreq + 2 * $Q2)) + ) + / + ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2))**2; + + $d = ($C_infreq - $D_infreq * $dc_infreq) + / $C_infreq**2 + + $t / ($t - 1) + * ($C_infreq**2 * $n_infreq * ($n_infreq - 1) * $Q1 * ($si + 2 * $D_infreq) - $Q1 * $D_infreq * $si * + (2 * $C_infreq * $dc_infreq * $n_infreq * ($n_infreq - 1) + $C_infreq**2 * 2 * ($n_infreq - 1) + $C_infreq**2 * $n_infreq * 2) + ) + / $C_infreq**4 / $n_infreq**2 / ($n_infreq - 1)**2 + - ( -$Q1 * $dc_infreq) / $C_infreq**2; + } + else { + $dc_infreq = + - ( - ($t - 1) * $Q1**2 * (($t - 1) * $Q1 * $q + 2 * $Q2 * $q)) + / ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2))**2; + + $d = 
($C_infreq - $D_infreq * $dc_infreq) / $C_infreq**2 + + $t/($t - 1) + * ($C_infreq**2 * $n_infreq * ($n_infreq - 1) * $Q1 * ($si + $q * ($q - 1) * $D_infreq) - $Q1 * $D_infreq * $si + * (2 * $C_infreq * $dc_infreq * $n_infreq * ($n_infreq - 1) + + $C_infreq**2 * $q * ($n_infreq - 1) + + $C_infreq**2 * $n_infreq * $q + ) + ) + / $C_infreq**4 / $n_infreq**2 / ($n_infreq - 1)**2 + - ( - $Q1 * $dc_infreq) / $C_infreq**2; + } + } + else { + if ($q == 1) { + $dc_infreq + = -1 * + ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2) * 2 * $Q1 * ($t - 1) + - ($t - 1) * $Q1**2 * (($t - 1) * ($Q1 + $n_infreq) + 2 * $Q2) + ) + / ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2))**2; + } + elsif ($q == 2) { + $dc_infreq + = -1 * + ( -1 * ($t - 1) * $Q1**2 * + (2 * ($t - 1) * $Q1 + 2 * ($n_infreq + 2 * $Q2)) + ) + / ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2))**2; + } + else { + $dc_infreq + = -1 * + ( -1 * ($t - 1) * $Q1**2 * (($t - 1) * $Q1 * $q + 2 * $Q2 * $q)) + / ($n_infreq * (($t - 1) * $Q1 + 2 * $Q2))**2; + } + $d = ($C_infreq - $D_infreq * $dc_infreq) / $C_infreq**2; + } + + return $d; +} + +sub _get_ace_differential { + my $self = shift; + my %args = @_; + + my $k = 10; # later we will make this an argument + + my $f = $args{f}; + + return 1 if $f > $k; + + no autovivification; + + my $cv_rare_h = $args{cv}; + my $n_rare = $args{n_rare}; + my $c_rare = $args{C_rare}; # get from gamma calcs + my $D_rare = $args{S_rare}; # richness of labels with sample counts < $k + my $F = $args{freq_counts}; + my $t = $args{t}; + + my @u = (1..$k); + + $n_rare //= + sum + map {$_ * $F->{$_}} + grep {$_ <= $k} + keys %$F; + + my $si = sum map {$_ * ($_-1) * ($F->{$_} // 0)} @u; + + my $f1 = $F->{1}; + my $d; + + if ($cv_rare_h != 0) { + if ($f == 1) { + $d = (1 - $f1 / $n_rare + $D_rare * ($n_rare - $f1) / $n_rare**2) + / (1 - $f1/$n_rare)**2 + + + ( + (1 - $f1/$n_rare)**2 * $n_rare * ($n_rare - 1) + * ($D_rare * $si + $f1 * $si) + - $f1 * $D_rare * $si + * (-2 * (1 - $f1 / $n_rare) * ($n_rare - $f1) / 
$n_rare**2 + * $n_rare * ($n_rare - 1) + + (1 - $f1/$n_rare)**2*(2*$n_rare - 1) + ) + ) + / (1 - $f1/$n_rare)**4 / $n_rare**2 / ($n_rare - 1)**2 + - (1 - $f1 / $n_rare + $f1 * ($n_rare - $f1) + / $n_rare**2) + / (1 - $f1 / $n_rare)**2; + } + else { + $d = (1 - $f1 / $n_rare - $D_rare * $f * $f1 / $n_rare**2) + / (1 - $f1 / $n_rare)**2 + + ( + (1 - $f1 / $n_rare)**2 + * $n_rare * ($n_rare - 1) * $f1 * + ($si + $D_rare * $f * ($f - 1)) + - $f1 * $D_rare * $si * + (2 * (1 - $f1 / $n_rare) * $f1 * $f / $n_rare**2 + * $n_rare * ($n_rare - 1) + + (1 - $f1 / $n_rare)**2 * $f * ($n_rare - 1) + + (1 - $f1 / $n_rare)**2 * $n_rare * $f + ) + ) + / (1 - $f1/$n_rare)**4 / ($n_rare)**2 / ($n_rare - 1)**2 + + ($f * $f1**2 / $n_rare**2) + / (1 - $f1 / $n_rare)**2; + } + } + else { + if ($f == 1) { + $d = (1 - $f1 / $n_rare + $D_rare * ($n_rare - $f1) / $n_rare**2) + / (1 - $f1 / $n_rare)**2; + } + else { + $d = (1 - $f1 / $n_rare - $D_rare * $f * $f1 / $n_rare**2) + / (1 - $f1 / $n_rare)**2; + } + } + + return $d; +} + + +sub _calc_ace_confidence_intervals { + my $self = shift; + my %args = @_; + + my $estimate = $args{s_estimate}; + my $richness = $args{richness}; + my $variance = $args{variance}; + my $label_hash = $args{label_hash}; + my $freq_counts = $args{freq_counts}; + + # and now the confidence interval + my ($lower, $upper); + + # SpadeR treats values this close to zero as zero + if (($estimate - $richness) >= 0.00001) { + my $T = $estimate - $richness; + my $K = exp ($z_for_ci * sqrt (log (1 + $variance / $T**2))); + $lower = $richness + $T / $K; + $upper = $richness + $T * $K; + } + else { + my ($part1, $part2, $P) = (0, 0, 0); + foreach my $f (keys %$freq_counts) { + $part1 += $freq_counts->{$f} * (exp(-$f) - exp(-2*$f)); + $part2 += $f * exp (-$f) * $freq_counts->{$f}; + $P += $freq_counts->{$f} * exp (-$f) / $richness; + } + my $n = sum values %$label_hash; + my $var_obs = $part1 - $part2**2 / $n; # should be passed as an arg? 
+#say "var_obs is the same as the variance argument\n" if $var_obs == $variance; +#say "$n $richness $P $var_obs $part1 $part2"; + $lower = max($richness, $richness / (1 - $P) - $z_for_ci * sqrt($var_obs) / (1 - $P)); + $upper = $richness / (1 - $P) + $z_for_ci * sqrt($var_obs) / (1 - $P); + } + + my %results = ( + ci_lower => $lower, + ci_upper => $upper, + ); + + return wantarray ? %results : \%results; +} +1; + +__END__ + +=head1 NAME + +Biodiverse::Indices::EstimateS + +=head1 SYNOPSIS + + use Biodiverse::Indices; + +=head1 DESCRIPTION + +Species richness estimation indices for the Biodiverse system, +based on the EstimateS (L) +and SpadeR (L) software. + +It is inherited by Biodiverse::Indices and not to be used on it own. + +See L for more details. + +=head1 METHODS + +=over + +=item INSERT METHODS + +=back + +=head1 REPORTING ERRORS + +Use the issue tracker at http://www.purl.org/biodiverse + +=head1 COPYRIGHT + +Copyright (c) 2010 Shawn Laffan. All rights reserved. + +=head1 LICENSE + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +For a full copy of the license see . 
+ +=cut diff --git a/lib/Biodiverse/Randomise.pm b/lib/Biodiverse/Randomise.pm index d450b07f8..923a609fe 100644 --- a/lib/Biodiverse/Randomise.pm +++ b/lib/Biodiverse/Randomise.pm @@ -1614,6 +1614,19 @@ sub swap_to_reach_richness_targets { BY_UNFILLED_GP: while (scalar keys %unfilled_groups) { + +# debugging +#my %xx; +#foreach my $lb (keys %unfilled_gps_without_label) { +# my $lref = $unfilled_gps_without_label{$lb}; +# foreach my $gp (@$lref) { +# $xx{$gp}{$lb}++; +# } +#} +#use Test::More; +#Test::More::is_deeply (\%xx, \%unfilled_gps_without_label_by_gp, 'match'); + + my $target_label_count = $cloned_bd->get_label_count; my $target_group_count = $cloned_bd->get_group_count; diff --git a/lib/Biodiverse/TreeNode.pm b/lib/Biodiverse/TreeNode.pm index c372f476f..4d1191f9e 100644 --- a/lib/Biodiverse/TreeNode.pm +++ b/lib/Biodiverse/TreeNode.pm @@ -235,7 +235,7 @@ sub reset_total_length_below { my %descendents = $self->get_all_descendants; foreach my $child (values %descendents) { - $child->reset_total_length; + $child->reset_total_length_below; } } @@ -814,7 +814,7 @@ sub get_terminal_elements { return wantarray ? 
%$cache_ref : $cache_ref if defined $cache_ref; } - + my %list; if ($self->is_terminal_node) { diff --git a/t/24-Indices_lists-and-counts.t b/t/24-Indices_lists-and-counts.t index a0676ab0d..661a2b81e 100644 --- a/t/24-Indices_lists-and-counts.t +++ b/t/24-Indices_lists-and-counts.t @@ -10,27 +10,106 @@ use Test::Most; use Biodiverse::TestHelpers qw{ :runners + :basedata }; -run_indices_test1 ( - calcs_to_test => [qw/ - calc_elements_used - calc_element_lists_used - calc_abc_counts - calc_d - calc_local_range_lists - calc_local_range_stats - calc_redundancy - calc_richness - calc_local_sample_count_lists - calc_local_sample_count_stats - calc_label_count_quantile_position - calc_local_sample_count_quantiles - /], - calc_topic_to_test => 'Lists and Counts', - sort_array_lists => 1, - #generate_result_sets => 1, -); +use Devel::Symdump; +my $obj = Devel::Symdump->rnew(__PACKAGE__); +my @subs = grep {$_ =~ 'main::test_'} $obj->functions(); + +exit main( @ARGV ); + +sub main { + my @args = @_; + + if (@args) { + for my $name (@args) { + die "No test method test_$name\n" + if not my $func = (__PACKAGE__->can( 'test_' . 
$name ) || __PACKAGE__->can( $name )); + $func->(); + } + done_testing; + return 0; + } + + + foreach my $sub (sort @subs) { + no strict 'refs'; + $sub->(); + } + + done_testing; + return 0; +} + +sub test_main { + run_indices_test1 ( + calcs_to_test => [qw/ + calc_nonempty_elements_used + calc_elements_used + calc_element_lists_used + calc_abc_counts + calc_d + calc_local_range_lists + calc_local_range_stats + calc_redundancy + calc_richness + calc_local_sample_count_lists + calc_local_sample_count_stats + calc_label_count_quantile_position + calc_local_sample_count_quantiles + /], + calc_topic_to_test => 'Lists and Counts', + sort_array_lists => 1, + #generate_result_sets => 1, + ); +} + + +sub test_el_counts_with_empty_groups { + my $bd = get_basedata_object_from_site_data (CELL_SIZES => [200000, 200000]); + + my $gp_count = $bd->get_group_count; + my @nbr_set2 = $bd->get_groups; + + # add four empty groups + my $empty_gp_count = 4; + my @nbr_set1; + for my $i (1..$empty_gp_count) { + # should really insert using coords + my $coord = -$i * 200000 + 100000; + my $gp_name = "$coord:$coord"; + $bd->add_element (group => $gp_name); + push @nbr_set1, $gp_name; + } + + my $results2 = { + EL_COUNT_NONEMPTY_ALL => $gp_count, + EL_COUNT_NONEMPTY_SET1 => 0, + EL_COUNT_NONEMPTY_SET2 => $gp_count, + EL_COUNT_ALL => $gp_count + $empty_gp_count, + EL_COUNT_SET1 => $empty_gp_count, + EL_COUNT_SET2 => $gp_count, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_nonempty_elements_used + calc_elements_used + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => \@nbr_set1, + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_el_counts_with_empty_groups', + ); + + return; +} done_testing; @@ -203,6 +282,9 @@ __DATA__ EL_COUNT_ALL => 5, EL_COUNT_SET1 => 1, EL_COUNT_SET2 => 4, + EL_COUNT_NONEMPTY_ALL => 5, + EL_COUNT_NONEMPTY_SET1 => 1, + 
EL_COUNT_NONEMPTY_SET2 => 4, EL_LIST_ALL => [ '3250000:850000', '3350000:850000', '3350000:750000', '3350000:950000', @@ -280,6 +362,9 @@ __DATA__ ABC3_SUM_SET1 => 6, ABC_D => 29, EL_COUNT_SET1 => 1, + EL_COUNT_ALL => 1, + EL_COUNT_NONEMPTY_ALL => 1, + EL_COUNT_NONEMPTY_SET1 => 1, EL_LIST_SET1 => { '3350000:850000' => 1 }, LABEL_COUNT_RANK_PCT => { 'Genus:sp20' => undef, diff --git a/t/24-Indices_richness_estimators.t b/t/24-Indices_richness_estimators.t new file mode 100644 index 000000000..952321e90 --- /dev/null +++ b/t/24-Indices_richness_estimators.t @@ -0,0 +1,947 @@ +use strict; +use warnings; +use 5.016; + +local $| = 1; + +use rlib; +use Test::Most; + +use Readonly; + +use Biodiverse::TestHelpers qw{ + :runners + :basedata +}; +use Data::Section::Simple qw(get_data_section); + +use Devel::Symdump; +my $obj = Devel::Symdump->rnew(__PACKAGE__); +my @subs = grep {$_ =~ 'main::test_'} $obj->functions(); + +# a few package "constants" +my $bd = get_basedata(); +Readonly my $focal_gp => 'Broad_Meadow_Brook'; +Readonly my @nbr_set2 => grep {$_ ne $focal_gp} $bd->get_groups; + +exit main( @ARGV ); + +sub main { + my @args = @_; + + if (@args) { + for my $name (@args) { + die "No test method test_$name\n" + if not my $func = (__PACKAGE__->can( 'test_' . 
$name ) || __PACKAGE__->can( $name )); + $func->($bd); + } + done_testing; + return 0; + } + + + foreach my $sub (sort @subs) { + no strict 'refs'; + $sub->($bd); + } + + done_testing; + return 0; +} + + +sub test_indices { + my $bd = shift->clone; + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao1 + calc_chao2 + calc_ace + calc_ice + /], + calc_topic_to_test => 'Richness estimators', + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + descr_suffix => 'main tests', + #generate_result_sets => 1, + ); + + return; +} + +sub test_indices_1col { + my $bd = shift->clone; + + $bd->delete_groups (groups => \@nbr_set2); + + my $results_overlay2 = { + CHAO1_ESTIMATE => '15.5555555555556', + CHAO1_CI_LOWER => '8.93051', + CHAO1_CI_UPPER => '69.349636', + CHAO1_F1_COUNT => 4, + CHAO1_F2_COUNT => 1, + CHAO1_META => { + CHAO_FORMULA => 2, + CI_FORMULA => 13, + VARIANCE_FORMULA => 6 + }, + CHAO1_UNDETECTED => '7.55555555555556', + CHAO1_VARIANCE => 121.728395, + CHAO1_SE => 11.033059, + CHAO2_ESTIMATE => 8, + CHAO2_CI_LOWER => undef, + CHAO2_CI_UPPER => undef, + CHAO2_META => { + CHAO_FORMULA => 4, + CI_FORMULA => 13, + VARIANCE_FORMULA => 11 + }, + CHAO2_Q1_COUNT => 8, + CHAO2_Q2_COUNT => 0, + CHAO2_UNDETECTED => 0, + CHAO2_VARIANCE => 0, + CHAO2_SE => 0, + }; + # identical to overlay2 since we have only one group in the basedata + my $results_overlay1 = {%$results_overlay2}; + + my %expected_results_overlay = ( + 1 => $results_overlay1, + 2 => $results_overlay2, + ); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao1 + calc_chao2 + /], + #calc_topic_to_test => 'Richness estimators', + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => [], + expected_results => \%expected_results_overlay, + descr_suffix => 'one group', + ); + + return; +} + + +# chao2 differs when q1 or q2 are zero +sub test_chao1_F2_no_F1 { + my $bd = shift->clone; + + # need to ensure 
there are no uniques - bump their sample counts + foreach my $label ($bd->get_labels) { + if ($bd->get_label_sample_count(element => $label) == 1) { + $bd->add_element (group => $focal_gp, label => $label, count => 20); + } + } + + my $results2 = { + CHAO1_ESTIMATE => 26, CHAO1_SE => 0.8749986, + CHAO1_F1_COUNT => 0, CHAO1_F2_COUNT => 5, + CHAO1_UNDETECTED => 0, CHAO1_VARIANCE => 0.7656226, + CHAO1_CI_LOWER => 26, CHAO1_CI_UPPER => 28.680536, + CHAO1_META => { + CHAO_FORMULA => 0, + CI_FORMULA => 14, + VARIANCE_FORMULA => 8, + }, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao1 + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_chao1_F2_no_F1', + ); + + return; +} + +sub test_chao1_F1_no_F2 { + my $bd = shift->clone; + + # need to ensure there are no doubles + foreach my $label ($bd->get_labels) { + if ($bd->get_label_sample_count(element => $label) == 2) { + $bd->add_element (group => $focal_gp, label => $label, count => 20); + } + } + + my $results2 = { + CHAO1_ESTIMATE => 31.986014, CHAO1_SE => 7.264041, + CHAO1_F1_COUNT => 4, CHAO1_F2_COUNT => 0, + CHAO1_UNDETECTED => 5.986014, CHAO1_VARIANCE => 52.766285, + CHAO1_CI_LOWER => 26.927382, CHAO1_CI_UPPER => 64.638212, + CHAO1_META => { + CHAO_FORMULA => 2, + CI_FORMULA => 13, + VARIANCE_FORMULA => 7, + }, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao1 + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_chao1_F1_no_F2', + ); + + return; +} + +sub test_chao1_no_F1_no_F2 { + my $bd = shift->clone; + + # need to ensure there are no uniques or doubles - make them all occur 
everywhere + foreach my $label ($bd->get_labels) { + my $sc = $bd->get_label_sample_count (element => $label); + if ($sc == 1 || $sc == 2) { + $bd->add_element (group => $focal_gp, label => $label, count => 20); + } + } + + my $results2 = { + CHAO1_ESTIMATE => 26, CHAO1_SE => 0.43544849, + CHAO1_F1_COUNT => 0, CHAO1_F2_COUNT => 0, + CHAO1_UNDETECTED => 0, CHAO1_VARIANCE => 0.1896153894, + CHAO1_CI_LOWER => 26, CHAO1_CI_UPPER => 27.060214, + CHAO1_META => { + CHAO_FORMULA => 0, + CI_FORMULA => 14, + VARIANCE_FORMULA => 8, + }, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao1 + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_chao1_no_F1_no_F2', + ); + + return; +} +# chao2 differs when q1 or q2 are zero +sub test_chao2_Q2_no_Q1 { + my $bd = shift->clone; + + # need to ensure there are no uniques - make them all occur everywhere + foreach my $label ($bd->get_labels) { + if ($bd->get_range(element => $label) == 1) { + foreach my $group ($bd->get_groups) { + $bd->add_element (group => $group, label => $label); + } + } + } + + my $results2 = { + CHAO2_ESTIMATE => 26, CHAO2_SE => 0.629523, + CHAO2_Q1_COUNT => 0, CHAO2_Q2_COUNT => 8, + CHAO2_UNDETECTED => 0, CHAO2_VARIANCE => 0.396299, + CHAO2_CI_LOWER => 26.030051, CHAO2_CI_UPPER => 28.623668, + CHAO2_META => { + CHAO_FORMULA => 0, + CI_FORMULA => 14, + VARIANCE_FORMULA => 12, + }, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao2 + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_chao2_Q2_no_Q1', + ); + + return; +} + +sub test_chao2_Q1_no_Q2 { + my $bd = shift->clone; + + # need 
to ensure there are no uniques - make them all occur everywhere + foreach my $label ($bd->get_labels) { + if ($bd->get_range(element => $label) == 2) { + foreach my $group ($bd->get_groups) { + $bd->add_element (group => $group, label => $label); + } + } + } + + my $results2 = { + CHAO2_ESTIMATE => 39.636364, CHAO2_SE => 12.525204, + CHAO2_Q1_COUNT => 6, CHAO2_Q2_COUNT => 0, + CHAO2_UNDETECTED => 13.636364, CHAO2_VARIANCE => 156.880734, + CHAO2_CI_LOWER => 28.943958, CHAO2_CI_UPPER => 89.163404, + CHAO2_META => { + CHAO_FORMULA => 4, + CI_FORMULA => 13, + VARIANCE_FORMULA => 11, + }, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao2 + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_chao2_Q1_no_Q2', + ); + + return; +} + +sub test_chao2_no_Q1_no_Q2 { + my $bd = shift->clone; + + # need to ensure there are no uniques or doubles - make them all occur everywhere + foreach my $label ($bd->get_labels) { + my $range = $bd->get_range(element => $label); + if ($range == 1 || $range == 2) { + foreach my $group ($bd->get_groups) { + $bd->add_element (group => $group, label => $label); + } + } + } + + my $results2 = { + CHAO2_ESTIMATE => 26, CHAO2_SE => 0.3696727, + CHAO2_Q1_COUNT => 0, CHAO2_Q2_COUNT => 0, + CHAO2_UNDETECTED => 0, CHAO2_VARIANCE => 0.1366579, + CHAO2_CI_LOWER => 26, CHAO2_CI_UPPER => 26.910731, + CHAO2_META => { + CHAO_FORMULA => 0, + CI_FORMULA => 14, + VARIANCE_FORMULA => 12, + }, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_chao2 + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_chao2_no_Q1_no_Q2', + ); + + return; 
+} + + +sub test_ICE { + my $bd = shift->clone; + + my $results2 = { + ICE_ESTIMATE => 29.606691, + ICE_SE => 3.130841, + ICE_VARIANCE => 9.8021639498, + ICE_CI_LOWER => 26.83023165, + ICE_CI_UPPER => 41.668186, + ICE_UNDETECTED => 3.606691, + ICE_INFREQUENT_COUNT => 24, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ice + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ICE base', + ); + + return; +} + +sub test_ICE_no_singletons { + my $bd = shift->clone; + + # need to ensure there are no uniques - make them all occur everywhere + foreach my $label ($bd->get_labels) { + if ($bd->get_range(element => $label) == 1) { + foreach my $group ($bd->get_groups) { + $bd->add_element (group => $group, label => $label); + } + } + } + + my $results2 = { + ICE_ESTIMATE => 26, + ICE_SE => 0.6295226, + ICE_VARIANCE => 0.3962987, + ICE_CI_LOWER => 26.0300513322865, + ICE_CI_UPPER => 28.623668086166, + ICE_UNDETECTED => 0, + ICE_INFREQUENT_COUNT => 18, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ice + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ICE_no_singletons', + ); + + return; +} + +sub test_ICE_one_group { + my $bd = shift->clone; + + my $results2 = { + ICE_ESTIMATE => 8, + ICE_SE => undef, + ICE_VARIANCE => undef, + ICE_CI_LOWER => undef, + ICE_CI_UPPER => undef, + ICE_UNDETECTED => undef, + ICE_INFREQUENT_COUNT => 8, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ice + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => [], + 
expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ICE_one_group', + ); + + return; +} + +# should match the chao2 estimate in this case +sub test_ICE_no_infrequents { + my $bd = shift->clone; + + foreach my $label ($bd->get_labels) { + my $range = $bd->get_range(element => $label); + if ($range && $range <= 10) { + foreach my $group ($bd->get_groups) { + $bd->add_element (group => $group, label => $label); + } + } + } + + my $results2 = { + ICE_ESTIMATE => 26, + ICE_SE => 0.020789, + ICE_VARIANCE => 0.000432, + ICE_CI_LOWER => 26, + ICE_CI_UPPER => 26.04118, + ICE_UNDETECTED => 0, + ICE_INFREQUENT_COUNT => 0, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ice + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ICE_no_infrequents', + ); + + return; +} + + +sub test_ACE { + my $bd = shift->clone; + + my $results2 = { + ACE_ESTIMATE => 28.459957, + ACE_SE => 2.457292, + ACE_VARIANCE => 6.03828211669945, + ACE_CI_LOWER => 26.4817368287846, + ACE_CI_UPPER => 38.561607, + ACE_UNDETECTED => 2.459957, + ACE_INFREQUENT_COUNT => 19, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ace + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ACE base', + ); + + return; +} + +sub test_ACE_no_rares { + my $bd = shift->clone; + + # need to ensure there are no rares + foreach my $label ($bd->get_labels) { + my $count = $bd->get_label_sample_count(element => $label); + if ($count && $count <= 10) { + $bd->add_element (group => $focal_gp, label => $label, count => 20); + } + } + + my $results2 = { + ACE_ESTIMATE 
=> 26, + ACE_SE => 0.002129, + ACE_VARIANCE => 5e-006, + ACE_CI_LOWER => 26, + ACE_CI_UPPER => 26.004177, + ACE_UNDETECTED => 0, + ACE_INFREQUENT_COUNT => 0, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ace + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ACE_no_rares', + ); + + return; +} + +sub test_ACE_no_singletons { + my $bd = shift->clone; + + # need to ensure there are no singletons + foreach my $label ($bd->get_labels) { + my $count = $bd->get_label_sample_count(element => $label); + if ($count == 1) { + $bd->add_element (group => $focal_gp, label => $label, count => 20); + } + } + + my $results2 = { + ACE_ESTIMATE => 26, + ACE_SE => 0.874999, + ACE_VARIANCE => 0.765623, + ACE_CI_LOWER => 26, + ACE_CI_UPPER => 28.680536, + ACE_UNDETECTED => 0, + ACE_INFREQUENT_COUNT => 15, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ace + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ACE_no_singletons', + ); + + return; +} + +sub test_ACE_only_singletons { + my $bd = shift->clone; + + # need to ensure there are only singletons in $focal_gp + foreach my $label ($bd->get_labels_in_group(group => $focal_gp)) { + $bd->delete_sub_element (group => $focal_gp, label => $label); + $bd->add_element (group => $focal_gp, label => $label, count => 1); + } + + my $results2 = { + ACE_ESTIMATE => 8, + ACE_SE => undef, + ACE_VARIANCE => undef, + ACE_CI_LOWER => undef, + ACE_CI_UPPER => undef, + ACE_UNDETECTED => undef, + ACE_INFREQUENT_COUNT => 8, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + 
calc_ace + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => [], + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ACE_only_singletons', + ); + + return; +} + +sub test_ACE_empty_group { + my $bd = shift->clone; + + # empty $focal_gp + foreach my $label ($bd->get_labels_in_group(group => $focal_gp)) { + $bd->delete_sub_element (group => $focal_gp, label => $label); + } + $bd->add_element (group => $focal_gp, count => 0); + + my $results2 = { + ACE_ESTIMATE => 0, + ACE_SE => undef, + ACE_VARIANCE => undef, + ACE_CI_LOWER => undef, + ACE_CI_UPPER => undef, + ACE_UNDETECTED => undef, + ACE_INFREQUENT_COUNT => 0, # trigger failure + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ace + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => [], + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ACE_empty_group', + ); + + return; +} + +sub test_ICE_all_groups_empty { + my $bd = shift->clone; + + foreach my $group ($bd->get_groups ) { + $bd->delete_group (group => $group); + $bd->add_element (group => $group, count => 0); + } + + my $results2 = { + ICE_ESTIMATE => 0, + ICE_SE => undef, + ICE_VARIANCE => undef, + ICE_CI_LOWER => undef, + ICE_CI_UPPER => undef, + ICE_UNDETECTED => undef, + ICE_INFREQUENT_COUNT => 0, + EL_COUNT_NONEMPTY_ALL => 0, + EL_COUNT_NONEMPTY_SET1 => 0, + EL_COUNT_NONEMPTY_SET2 => 0, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ice + calc_nonempty_elements_used + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => \@nbr_set2, + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ICE_all_groups_empty', + ); + + return; +} + + +sub 
test_ICE_additional_empty_group { + my $bd = shift->clone; + + my $extra_gp_name = 'extra empty group'; + $bd->add_element (group => $extra_gp_name, count => 0); + + my $results2 = { + ICE_ESTIMATE => 29.606691, + ICE_SE => 3.130841, + ICE_VARIANCE => 9.8021639498, + ICE_CI_LOWER => 26.83023165, + ICE_CI_UPPER => 41.668186, + ICE_UNDETECTED => 3.606691, + ICE_INFREQUENT_COUNT => 24, + EL_COUNT_NONEMPTY_ALL => 11, + EL_COUNT_NONEMPTY_SET1 => 1, + EL_COUNT_NONEMPTY_SET2 => 10, + }; + + my %expected_results = (2 => $results2); + + run_indices_test1 ( + calcs_to_test => [qw/ + calc_ice + calc_nonempty_elements_used + /], + sort_array_lists => 1, + basedata_ref => $bd, + element_list1 => [$focal_gp], + element_list2 => [@nbr_set2, $extra_gp_name], + expected_results => \%expected_results, + skip_nbr_counts => {1 => 1}, + descr_suffix => 'test_ICE_additional_empty_group', + ); + + return; +} + + +sub get_basedata { + + my $data = get_data_section ('SAMPLE_DATA'); + + my $bd = get_basedata_object_from_mx_format ( + name => 'EstimateS', + CELL_SIZES => [-1], + data => $data, + data_in_matrix_form => 1, + label_start_col => 1, + input_sep_char => "\t", + ); + + return $bd->transpose; +} + + +1; + +__DATA__ + + +@@ SAMPLE_DATA +sp Broad_Meadow_Brook Cold_Brook Doyle_Center Drumlin_Farm Graves_Farm Ipswich_River Laughing_Brook Lowell_Holly Moose_Hill Nashoba_Brook Old_Town_Hill +aphful 0 0 0 0 0 0 0 1 0 0 1 +aphrud 4 13 5 4 7 7 10 16 8 12 13 +bradep 0 0 0 0 0 0 0 0 0 0 0 +camchr 0 0 0 0 0 0 4 0 1 0 0 +camher 0 1 0 0 0 0 0 0 0 0 0 +camnea 0 1 0 0 0 0 0 1 0 0 0 +camnov 0 0 0 0 0 0 0 0 0 0 0 +campen 4 2 1 6 1 9 6 5 7 10 6 +crelin 0 0 0 0 0 0 0 0 0 0 0 +dolpla 0 0 0 0 0 0 0 0 0 0 0 +forinc 0 0 0 0 0 0 0 0 0 0 0 +forlas 0 0 0 0 0 0 0 0 0 0 0 +forneo1 0 0 0 2 0 0 0 0 0 0 2 +forneo2 0 0 0 0 0 0 0 0 0 0 0 +fornep 0 0 0 0 0 0 0 0 0 0 0 +forper 0 0 0 0 0 0 0 0 2 0 0 +forsub1 0 0 0 0 0 2 0 0 1 0 0 +forsub3 2 0 0 0 0 9 1 2 4 1 0 +lasali 4 10 0 0 7 2 4 0 0 2 9 +lascla 1 0 0 1 0 0 0 0 2 2 
0 +lasfla 0 0 0 0 0 0 0 0 0 0 0 +laslat 0 0 0 0 0 0 2 0 0 0 1 +lasnea 1 0 4 2 4 2 0 0 6 6 0 +lasneo 0 0 0 0 0 0 0 0 0 0 0 +lasspe 1 0 0 0 0 0 0 0 0 0 0 +lasumb 0 0 2 0 0 0 3 0 1 0 0 +myrame1 0 0 0 0 0 0 2 0 0 0 0 +myrame2 0 0 0 0 0 0 0 0 0 0 0 +myrdet 0 0 0 3 0 2 0 0 0 2 6 +myrinc 0 0 0 0 0 0 1 0 0 0 0 +myrnea 0 0 0 0 0 0 0 0 0 0 1 +myrpun 1 0 0 2 0 2 5 0 1 2 0 +myrrub 0 0 0 0 0 0 0 0 0 0 0 +ponpen 0 0 0 0 0 0 0 0 0 0 0 +preimp 0 0 0 0 0 0 0 0 0 0 0 +proame 0 0 0 0 0 0 0 1 0 0 1 +solmol 0 0 0 0 0 0 0 0 0 0 0 +stebre 0 0 0 0 0 0 0 0 0 0 0 +steimp 0 0 0 1 0 0 1 1 0 0 1 +tapses 0 0 0 0 0 0 2 0 3 0 2 +temamb 0 0 0 0 0 0 0 0 0 0 0 +temcur 0 0 0 2 0 0 0 0 0 1 0 +temlon 0 1 0 4 0 0 1 4 0 0 0 + + +@@ RESULTS_2_NBR_LISTS +{ ACE_CI_LOWER => '26.4817368225888', + ACE_CI_UPPER => '38.5616065911166', + ACE_ESTIMATE => '28.4599570008062', + ACE_INFREQUENT_COUNT => 19, + ACE_SE => '2.4572916222336', + ACE_UNDETECTED => '2.45995700080623', + ACE_VARIANCE => '6.03828211669945', + CHAO1_CI_LOWER => '26.2163119159043', + CHAO1_CI_UPPER => '37.7629272999573', + CHAO1_ESTIMATE => '27.5951367781155', + CHAO1_F1_COUNT => 4, + CHAO1_F2_COUNT => 5, + CHAO1_META => { + CHAO_FORMULA => 2, + CI_FORMULA => 13, + VARIANCE_FORMULA => 6 + }, + CHAO1_SE => '2.15603580378238', + CHAO1_UNDETECTED => '1.5951367781155', + CHAO1_VARIANCE => '4.64849038719155', + CHAO2_CI_LOWER => '26.3450800735193', + CHAO2_CI_UPPER => '38.1243868266594', + CHAO2_ESTIMATE => '28.0454545454545', + CHAO2_META => { + CHAO_FORMULA => 4, + CI_FORMULA => 13, + VARIANCE_FORMULA => 10 + }, + CHAO2_Q1_COUNT => 6, + CHAO2_Q2_COUNT => 8, + CHAO2_SE => '2.31466979955927', + CHAO2_UNDETECTED => '2.04545454545455', + CHAO2_VARIANCE => '5.35769628099174', + ICE_CI_LOWER => '26.8302316060467', + ICE_CI_UPPER => '41.6681855466098', + ICE_ESTIMATE => '29.6066913993576', + ICE_INFREQUENT_COUNT => 24, + ICE_SE => '3.13084077363682', + ICE_UNDETECTED => '3.60669139935755', + ICE_VARIANCE => '9.80216394986679' +} + + +@@ RESULTS_1_NBR_LISTS 
+{ ACE_CI_LOWER => '8.5996888836216', + ACE_CI_UPPER => '30.9753940794084', + ACE_ESTIMATE => '11.7118847539016', + ACE_INFREQUENT_COUNT => 8, + ACE_SE => '4.35262370708667', + ACE_UNDETECTED => '3.71188475390156', + ACE_VARIANCE => '18.9453331354929', + CHAO1_CI_LOWER => '8.93050951620995', + CHAO1_CI_UPPER => '69.3496356121155', + CHAO1_ESTIMATE => '15.5555555555556', + CHAO1_F1_COUNT => 4, + CHAO1_F2_COUNT => 1, + CHAO1_META => { + CHAO_FORMULA => 2, + CI_FORMULA => 13, + VARIANCE_FORMULA => 6 + }, + CHAO1_SE => '11.0330591887168', + CHAO1_UNDETECTED => '7.55555555555556', + CHAO1_VARIANCE => '121.728395061728', + CHAO2_CI_LOWER => undef, + CHAO2_CI_UPPER => undef, + CHAO2_ESTIMATE => 8, + CHAO2_META => { + CHAO_FORMULA => 4, + CI_FORMULA => 13, + VARIANCE_FORMULA => 11 + }, + CHAO2_Q1_COUNT => 8, + CHAO2_Q2_COUNT => 0, + CHAO2_SE => '0', + CHAO2_UNDETECTED => 0, + CHAO2_VARIANCE => 0, + ICE_CI_LOWER => undef, + ICE_CI_UPPER => undef, + ICE_ESTIMATE => 8, + ICE_INFREQUENT_COUNT => 8, + ICE_SE => undef, + ICE_UNDETECTED => undef, + ICE_VARIANCE => undef +} + + diff --git a/t/EstimateS_data.txt b/t/EstimateS_data.txt new file mode 100644 index 000000000..11039f773 --- /dev/null +++ b/t/EstimateS_data.txt @@ -0,0 +1,44 @@ +sp Broad_Meadow_Brook Cold_Brook Doyle_Center Drumlin_Farm Graves_Farm Ipswich_River Laughing_Brook Lowell_Holly Moose_Hill Nashoba_Brook Old_Town_Hill +aphful 0 0 0 0 0 0 0 1 0 0 1 +aphrud 4 13 5 4 7 7 10 16 8 12 13 +bradep 0 0 0 0 0 0 0 0 0 0 0 +camchr 0 0 0 0 0 0 4 0 1 0 0 +camher 0 1 0 0 0 0 0 0 0 0 0 +camnea 0 1 0 0 0 0 0 1 0 0 0 +camnov 0 0 0 0 0 0 0 0 0 0 0 +campen 4 2 1 6 1 9 6 5 7 10 6 +crelin 0 0 0 0 0 0 0 0 0 0 0 +dolpla 0 0 0 0 0 0 0 0 0 0 0 +forinc 0 0 0 0 0 0 0 0 0 0 0 +forlas 0 0 0 0 0 0 0 0 0 0 0 +forneo1 0 0 0 2 0 0 0 0 0 0 2 +forneo2 0 0 0 0 0 0 0 0 0 0 0 +fornep 0 0 0 0 0 0 0 0 0 0 0 +forper 0 0 0 0 0 0 0 0 2 0 0 +forsub1 0 0 0 0 0 2 0 0 1 0 0 +forsub3 2 0 0 0 0 9 1 2 4 1 0 +lasali 4 10 0 0 7 2 4 0 0 2 9 +lascla 1 0 0 1 0 0 0 0 
2 2 0 +lasfla 0 0 0 0 0 0 0 0 0 0 0 +laslat 0 0 0 0 0 0 2 0 0 0 1 +lasnea 1 0 4 2 4 2 0 0 6 6 0 +lasneo 0 0 0 0 0 0 0 0 0 0 0 +lasspe 1 0 0 0 0 0 0 0 0 0 0 +lasumb 0 0 2 0 0 0 3 0 1 0 0 +myrame1 0 0 0 0 0 0 2 0 0 0 0 +myrame2 0 0 0 0 0 0 0 0 0 0 0 +myrdet 0 0 0 3 0 2 0 0 0 2 6 +myrinc 0 0 0 0 0 0 1 0 0 0 0 +myrnea 0 0 0 0 0 0 0 0 0 0 1 +myrpun 1 0 0 2 0 2 5 0 1 2 0 +myrrub 0 0 0 0 0 0 0 0 0 0 0 +ponpen 0 0 0 0 0 0 0 0 0 0 0 +preimp 0 0 0 0 0 0 0 0 0 0 0 +proame 0 0 0 0 0 0 0 1 0 0 1 +solmol 0 0 0 0 0 0 0 0 0 0 0 +stebre 0 0 0 0 0 0 0 0 0 0 0 +steimp 0 0 0 1 0 0 1 1 0 0 1 +tapses 0 0 0 0 0 0 2 0 3 0 2 +temamb 0 0 0 0 0 0 0 0 0 0 0 +temcur 0 0 0 2 0 0 0 0 0 1 0 +temlon 0 1 0 4 0 0 1 4 0 0 0 diff --git a/t/lib/Biodiverse/TestHelpers.pm b/t/lib/Biodiverse/TestHelpers.pm index fd64d97c0..12c240b9b 100644 --- a/t/lib/Biodiverse/TestHelpers.pm +++ b/t/lib/Biodiverse/TestHelpers.pm @@ -55,6 +55,7 @@ use Exporter::Easy ( get_basedata_object get_basedata_object_from_site_data get_numeric_labels_basedata_object_from_site_data + get_basedata_object_from_mx_format :utils ), ], @@ -508,6 +509,27 @@ sub get_basedata_object { return $bd; } +sub get_basedata_object_from_mx_format { + my %args = @_; + + my $bd_f = get_basedata_import_data_file(@_); + + print "Temp file is $bd_f\n"; + + my $bd = Biodiverse::BaseData->new( + CELL_SIZES => $args{CELL_SIZES}, + NAME => 'Test basedata', + ); + $bd->import_data( + input_files => [$bd_f], + label_columns => [], + group_columns => [0], + %args, + ); + + return $bd; +} + sub get_basedata_object_from_site_data { my %args = @_; @@ -757,11 +779,13 @@ sub run_indices_test1 { my $use_label_properties_extra = $args{use_label_properties_extra}; # boolean my $use_label_properties_binomial = $args{use_label_properties_binomial}; # boolean my $callbacks = $args{callbacks}; + my $expected_results = $args{expected_results} // {}; my $expected_results_overlay = $args{expected_results_overlay}; my $sort_array_lists = $args{sort_array_lists}; my $precision 
= $args{precisions} // '%10f'; # compare numeric values to 10 dp. my $descr_suffix = $args{descr_suffix} // ''; my $processing_element = $args{processing_element} // '3350000:850000'; + my $skip_nbr_counts = $args{skip_nbr_counts} // {}; delete $args{callbacks}; # Used for acquiring sample results @@ -783,13 +807,14 @@ sub run_indices_test1 { my %bd_args = (%args, CELL_SIZES => $cell_sizes); - my $bd = $use_numeric_labels - ? get_numeric_labels_basedata_object_from_site_data ( - %bd_args, - ) - : get_basedata_object_from_site_data ( - %bd_args, - ); + my $bd = $args{basedata_ref}; + $bd ||= $use_numeric_labels + ? get_numeric_labels_basedata_object_from_site_data ( + %bd_args, + ) + : get_basedata_object_from_site_data ( + %bd_args, + ); if ($args{nbr_set2_sp_select_all}) { # get all groups, but ensure no overlap with NS1 @@ -908,11 +933,14 @@ sub run_indices_test1 { my %results_by_nbr_list; + NBR_COUNT: foreach my $nbr_list_count (2, 1) { if ($nbr_list_count == 1) { delete $elements{element_list2}; } + next NBR_COUNT if $skip_nbr_counts->{$nbr_list_count}; + my %indices_args = ( calcs_to_test => $calcs_to_test, calc_args => $calc_args, @@ -929,9 +957,10 @@ sub run_indices_test1 { # now we need to check the results my $subtest_name = "Result set matches for neighbour count $nbr_list_count"; - my $expected = eval $dss->get_data_section( - "RESULTS_${nbr_list_count}_NBR_LISTS" - ); + my $expected = $expected_results->{$nbr_list_count} + // eval $dss->get_data_section( + "RESULTS_${nbr_list_count}_NBR_LISTS" + ); diag "Problem with data section: $EVAL_ERROR" if $EVAL_ERROR; if ($expected_results_overlay && $expected_results_overlay->{$nbr_list_count}) { my $hash = $expected_results_overlay->{$nbr_list_count};